From 8e581522287cd9f9ca18ac4bf56944e8ac455ea6 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sat, 29 Mar 2025 13:49:54 -0700 Subject: [PATCH 01/10] shrink data for scalar instructions with contained const vectors --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 8 ++--- src/coreclr/jit/instr.cpp | 34 ++++++++++++++------- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index fabf3ec922bf42..31c70dc14a22fe 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1640,7 +1640,7 @@ class CodeGen final : public CodeGenInterface } }; - OperandDesc genOperandDesc(GenTree* op); + OperandDesc genOperandDesc(instruction ins, GenTree* op); void inst_TT(instruction ins, emitAttr size, GenTree* op1); void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2); diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 19792a61c4083e..dc50e1bd21cc2f 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1076,7 +1076,7 @@ void CodeGen::genHWIntrinsic_R_RM( instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc rmOpDesc = genOperandDesc(rmOp); + OperandDesc rmOpDesc = genOperandDesc(ins, rmOp); if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (rmOpDesc.GetKind() == OperandKind::Reg)) { @@ -1361,7 +1361,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); if (op2Desc.IsContained()) { @@ -1431,7 +1431,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM(instruction ins, instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc op3Desc = genOperandDesc(op3); + OperandDesc op3Desc = genOperandDesc(ins, op3); if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (op3Desc.GetKind() == OperandKind::Reg)) { @@ -1547,7 +1547,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I( instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc op3Desc = genOperandDesc(op3); + OperandDesc op3Desc = genOperandDesc(ins, op3); switch (op3Desc.GetKind()) { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 9d505f973bdfc7..d3dfd1bfff4ffa 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -809,6 +809,7 @@ void CodeGen::inst_RV_SH( // logic for determining what "kind" of operand "op" is. // // Arguments: +// ins - The instruction that will consume the operand. // op - The operand node for which to obtain the descriptor. // // Return Value: @@ -818,7 +819,7 @@ void CodeGen::inst_RV_SH( // This method is not idempotent - it can only be called once for a // given node. // -CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) +CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op) { if (!op->isContained() && !op->isUsedFromSpillTemp()) { @@ -915,7 +916,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) { // If the operand of broadcast is not a constant integer, // we handle all the other cases recursively. 
- return genOperandDesc(hwintrinsicChild); + return genOperandDesc(ins, hwintrinsicChild); } break; } @@ -935,7 +936,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(hwintrinsic->isContained()); op = hwintrinsic->Op(1); - return genOperandDesc(op); + return genOperandDesc(ins, op); } default: @@ -989,6 +990,20 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) #if defined(FEATURE_SIMD) case GT_CNS_VEC: { + insTupleType tupleType = emit->insTupleTypeInfo(ins); + + if ((tupleType == INS_TT_TUPLE1_SCALAR) || (tupleType == INS_TT_TUPLE1_FIXED)) + { + // We have a vector const, but the instruction will only read a scalar from it, + // so don't waste space putting the entire vector to the data section. + + unsigned cnsSize = max(CodeGenInterface::instInputSize(ins), 4U); + assert(cnsSize <= genTypeSize(op)); + + UNATIVE_OFFSET cnum = emit->emitDataConst(&op->AsVecCon()->gtSimdVal, cnsSize, cnsSize, TYP_UNDEF); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + switch (op->TypeGet()) { case TYP_SIMD8: @@ -1011,7 +1026,6 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) return OperandDesc(emit->emitSimd16Const(constValue)); } -#if defined(TARGET_XARCH) case TYP_SIMD32: { simd32_t constValue; @@ -1026,8 +1040,6 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) return OperandDesc(emit->emitSimd64Const(constValue)); } -#endif // TARGET_XARCH - default: { unreached(); @@ -1071,7 +1083,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) void CodeGen::inst_TT(instruction ins, emitAttr size, GenTree* op1) { emitter* emit = GetEmitter(); - OperandDesc op1Desc = genOperandDesc(op1); + OperandDesc op1Desc = genOperandDesc(ins, op1); switch (op1Desc.GetKind()) { @@ -1120,7 +1132,7 @@ void CodeGen::inst_TT(instruction ins, emitAttr size, GenTree* op1) void CodeGen::inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2) { emitter* emit = GetEmitter(); - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); switch (op2Desc.GetKind()) { @@ -1202,7 +1214,7 @@ void CodeGen::inst_RV_TT_IV( } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc rmOpDesc = genOperandDesc(rmOp); + OperandDesc rmOpDesc = genOperandDesc(ins, rmOp); switch (rmOpDesc.GetKind()) { @@ -1339,7 +1351,7 @@ void CodeGen::inst_RV_RV_TT(instruction ins, } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); switch (op2Desc.GetKind()) { @@ -1426,7 +1438,7 @@ void CodeGen::inst_RV_RV_TT_IV(instruction ins, } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); switch (op2Desc.GetKind()) { From 8d2337364a8b53aaaa0dba896f3ebcbfac3d83e0 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sun, 30 Mar 2025 23:39:03 -0700 Subject: [PATCH 02/10] allow CreateScalarUnsafe to remain scalar const --- src/coreclr/jit/gentree.cpp | 14 +++++- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 6 ++- src/coreclr/jit/lower.cpp | 15 +++++- src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerxarch.cpp | 52 +++++++++++++++++++-- src/coreclr/jit/simd.h | 2 +- src/coreclr/jit/valuenum.cpp | 12 ++--- 7 files changed, 86 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 6a062b02f2b12d..fc1ff7ac7f464a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -22969,7 +22969,19 @@ 
GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, } } - return vecCon; + +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) + if (vecCon->IsZero() || vecCon->IsAllBitsSet()) +#endif // TARGET_XARCH || TARGET_ARM64 + { + return vecCon; + } +#if defined(TARGET_XARCH) + else + { + op1 = vecCon; + } +#endif // TARGET_XARCH } #if defined(TARGET_XARCH) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index dc50e1bd21cc2f..e556d032b47e06 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1898,11 +1898,15 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) op1 = loPart; } - ins = INS_movq; baseAttr = EA_8BYTE; } #endif // TARGET_X86 + if (op1->isUsedFromMemory() && (baseAttr == EA_8BYTE)) + { + ins = INS_movq; + } + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, op1, instOptions); } else diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index a859ade2eb780e..e90416a70b14fc 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -9538,7 +9538,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing // void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) { -// LA, RISC-V and ARM32 more likely to recieve a terrible performance hit from +// LA, RISC-V and ARM32 more likely to receive a terrible performance hit from // unaligned accesses making this optimization questionable. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!comp->opts.OptimizationEnabled()) @@ -10006,7 +10006,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind) #endif // TODO-Cleanup: We're passing isContainable = true but ContainCheckIndir rejects - // address containment in some cases so we end up creating trivial (reg + offfset) + // address containment in some cases so we end up creating trivial (reg + offset) // or (reg + reg) LEAs that are not necessary. 
#if defined(TARGET_ARM64)
@@ -11086,6 +11086,17 @@ GenTree* Lowering::InsertNewSimdCreateScalarUnsafeNode(var_types simdType,
{
BlockRange().Remove(op1);
}
+ else
+ {
+ GenTree* resultOp1 = result->AsHWIntrinsic()->Op(1);
+
+ if (resultOp1->IsCnsVec())
+ {
+ BlockRange().Remove(op1);
+ BlockRange().InsertBefore(result, resultOp1);
+ }
+ }
+
return result;
}
#endif // FEATURE_HW_INTRINSICS
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index d44880bd947554..3792684bbdc0a1 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -132,6 +132,7 @@ class Lowering final : public Phase
#ifdef TARGET_XARCH
void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode);
void TryCompressConstVecData(GenTreeStoreInd* node);
+ GenTree* TryFoldHWIntrinsicToCnsVec(GenTree* childNode);
#endif // TARGET_XARCH
#endif // FEATURE_HW_INTRINSICS
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 0055b8693a32ad..13b3ec0fe6f0ed 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -4044,7 +4044,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
BlockRange().Remove(node);
- return LowerNode(vecCon);
+ return vecCon->gtNext;
}
else if (argCnt == 1)
{
@@ -9117,14 +9117,14 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
+ GenTree* op1 = hwintrinsic->Op(1);
+
if (!supportsSIMDScalarLoad)
{
- // Nothing to do if the intrinsic doesn't support scalar loads
- return false;
+ // If we are wrapping a constant vector, we can still satisfy a full vector load.
+ return (supportsSIMDLoad && op1->IsCnsVec());
}
- GenTree* op1 = hwintrinsic->Op(1);
-
if (IsInvariantInRange(op1, parentNode, hwintrinsic))
{
if (op1->isContained())
@@ -9457,6 +9457,38 @@ void Lowering::TryCompressConstVecData(GenTreeStoreInd* node)
LowerNode(broadcast);
}
+//------------------------------------------------------------------------
+// TryFoldHWIntrinsicToCnsVec: Tries to fold a HWIntrinsic node to a CnsVec
+//
+// Arguments:
+// node - The node to fold to a constant if possible
+//
+GenTree* Lowering::TryFoldHWIntrinsicToCnsVec(GenTree* node)
+{
+ if (node->OperIsHWIntrinsic())
+ {
+ GenTreeHWIntrinsic* intrinsic = node->AsHWIntrinsic();
+
+ if (HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsic->GetHWIntrinsicId()) && intrinsic->Op(1)->IsCnsVec())
+ {
+ node = intrinsic->Op(1);
+
+ LIR::Use use;
+ if (BlockRange().TryGetUse(intrinsic, &use))
+ {
+ use.ReplaceWith(node);
+ }
+ else
+ {
+ node->SetUnusedValue();
+ }
+ BlockRange().Remove(intrinsic);
+ }
+ }
+
+ return node;
+}
+
//------------------------------------------------------------------------
// TryMakeSrcContainedOrRegOptional: Tries to make "childNode" a contained or regOptional node
//
@@ -9470,6 +9502,8 @@ void Lowering::TryMakeSrcContainedOrRegOptional(GenTreeHWIntrinsic* parentNode,
if (IsContainableHWIntrinsicOp(parentNode, childNode, &supportsRegOptional))
{
+ childNode = TryFoldHWIntrinsicToCnsVec(childNode);
+
if (childNode->IsCnsVec() && parentNode->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding())
{
TryFoldCnsVecForEmbeddedBroadcast(parentNode, childNode->AsVecCon());
@@ -9915,6 +9949,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if (containedOperand != nullptr)
{
+ containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand);
+
if (containedOperand->IsCnsVec() &&
node->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding()) { @@ -10257,6 +10293,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (containedOperand != nullptr) { + containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand); + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding()) { @@ -10341,6 +10379,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (containedOperand != nullptr) { + containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand); + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible()) { TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); @@ -11044,6 +11084,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (containedOperand != nullptr) { + containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand); + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible()) { TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 2f7610b7e6147c..b67b72e83fc88d 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1412,7 +1412,7 @@ void EvaluateWithElementFloating(var_types simdBaseType, TSimd* result, const TS case TYP_DOUBLE: { - result->f64[arg1] = static_cast(arg2); + result->f64[arg1] = arg2; break; } diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index c4c50a11d58cf5..e92e88cff08fdf 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -8004,12 +8004,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, return VNForLongCon(static_cast(result)); } - case NI_Vector128_AsVector2: - { - simd8_t result = GetConstantSimd16(arg0VN).v64[0]; - return VNForSimd8Con(result); - } - case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: { @@ -8064,6 +8058,12 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, } #endif // TARGET_XARCH + case NI_Vector128_AsVector2: + { + simd8_t result = GetConstantSimd16(arg0VN).v64[0]; + return VNForSimd8Con(result); + } + case NI_Vector128_AsVector3: { simd12_t result = {}; From dfa3ea6a61dc38eebfb008fc5bc8e140474dfe6d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sun, 30 Mar 2025 23:49:05 -0700 Subject: [PATCH 03/10] formatting --- src/coreclr/jit/gentree.cpp | 1 - src/coreclr/jit/lower.h | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index fc1ff7ac7f464a..902cb620eb7e52 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -22969,7 +22969,6 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, } } - #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (vecCon->IsZero() || vecCon->IsAllBitsSet()) #endif // TARGET_XARCH || TARGET_ARM64 diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 3792684bbdc0a1..dc37010e6d1319 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -130,8 +130,8 @@ class Lowering final : public Phase void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr, unsigned size); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH - void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); - void TryCompressConstVecData(GenTreeStoreInd* node); + void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* 
parentNode, GenTreeVecCon* childNode); + void TryCompressConstVecData(GenTreeStoreInd* node); GenTree* TryFoldHWIntrinsicToCnsVec(GenTree* childNode); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS From eee8b5add9411a482bedc612551e64e999b08332 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 31 Mar 2025 15:29:13 -0700 Subject: [PATCH 04/10] revert CreateScalarUnsafe changes --- src/coreclr/jit/gentree.cpp | 13 +-------- src/coreclr/jit/lower.cpp | 11 -------- src/coreclr/jit/lower.h | 5 ++-- src/coreclr/jit/lowerxarch.cpp | 50 +++------------------------------- 4 files changed, 7 insertions(+), 72 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 902cb620eb7e52..6a062b02f2b12d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -22969,18 +22969,7 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, } } -#if defined(TARGET_XARCH) || defined(TARGET_ARM64) - if (vecCon->IsZero() || vecCon->IsAllBitsSet()) -#endif // TARGET_XARCH || TARGET_ARM64 - { - return vecCon; - } -#if defined(TARGET_XARCH) - else - { - op1 = vecCon; - } -#endif // TARGET_XARCH + return vecCon; } #if defined(TARGET_XARCH) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index e90416a70b14fc..cfb548fafe9b11 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11086,17 +11086,6 @@ GenTree* Lowering::InsertNewSimdCreateScalarUnsafeNode(var_types simdType, { BlockRange().Remove(op1); } - else - { - GenTree* resultOp1 = result->AsHWIntrinsic()->Op(1); - - if (resultOp1->IsCnsVec()) - { - BlockRange().Remove(op1); - BlockRange().InsertBefore(result, resultOp1); - } - } - return result; } #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index dc37010e6d1319..d44880bd947554 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -130,9 +130,8 @@ class Lowering final : public Phase void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr, unsigned size); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH - void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); - void TryCompressConstVecData(GenTreeStoreInd* node); - GenTree* TryFoldHWIntrinsicToCnsVec(GenTree* childNode); + void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); + void TryCompressConstVecData(GenTreeStoreInd* node); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 13b3ec0fe6f0ed..60df018d7b75dc 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9117,14 +9117,14 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: { - GenTree* op1 = hwintrinsic->Op(1); - if (!supportsSIMDScalarLoad) { - // If we are wrapping a constant vector, we can still satisfy a full vector load. 
- return (supportsSIMDLoad && op1->IsCnsVec());
+ // Nothing to do if the intrinsic doesn't support scalar loads
+ return false;
}
+ GenTree* op1 = hwintrinsic->Op(1);
+
if (IsInvariantInRange(op1, parentNode, hwintrinsic))
{
if (op1->isContained())
@@ -9457,38 +9457,6 @@ void Lowering::TryCompressConstVecData(GenTreeStoreInd* node)
LowerNode(broadcast);
}
-//------------------------------------------------------------------------
-// TryFoldHWIntrinsicToCnsVec: Tries to fold a HWIntrinsic node to a CnsVec
-//
-// Arguments:
-// node - The node to fold to a constant if possible
-//
-GenTree* Lowering::TryFoldHWIntrinsicToCnsVec(GenTree* node)
-{
- if (node->OperIsHWIntrinsic())
- {
- GenTreeHWIntrinsic* intrinsic = node->AsHWIntrinsic();
-
- if (HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsic->GetHWIntrinsicId()) && intrinsic->Op(1)->IsCnsVec())
- {
- node = intrinsic->Op(1);
-
- LIR::Use use;
- if (BlockRange().TryGetUse(intrinsic, &use))
- {
- use.ReplaceWith(node);
- }
- else
- {
- node->SetUnusedValue();
- }
- BlockRange().Remove(intrinsic);
- }
- }
-
- return node;
-}
-
//------------------------------------------------------------------------
// TryMakeSrcContainedOrRegOptional: Tries to make "childNode" a contained or regOptional node
//
@@ -9502,8 +9470,6 @@ void Lowering::TryMakeSrcContainedOrRegOptional(GenTreeHWIntrinsic* parentNode,
if (IsContainableHWIntrinsicOp(parentNode, childNode, &supportsRegOptional))
{
- childNode = TryFoldHWIntrinsicToCnsVec(childNode);
-
if (childNode->IsCnsVec() && parentNode->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding())
{
TryFoldCnsVecForEmbeddedBroadcast(parentNode, childNode->AsVecCon());
@@ -9949,8 +9915,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if (containedOperand != nullptr)
{
- containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand);
-
if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding())
{
@@ -10293,8 +10257,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if (containedOperand != nullptr)
{
- containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand);
-
if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding())
{
@@ -10379,8 +10341,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if (containedOperand != nullptr)
{
- containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand);
-
if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible())
{
TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon());
@@ -11084,8 +11044,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if (containedOperand != nullptr)
{
- containedOperand = TryFoldHWIntrinsicToCnsVec(containedOperand);
-
if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible())
{
TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon());
From 3810abac4be0c44407896b0f0389fb7e6fc8925c Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Tue, 1 Apr 2025 23:02:42 -0700
Subject: [PATCH 05/10] shrink constants in genSetRegToConst
---
src/coreclr/jit/codegenarm64.cpp | 14 +--
src/coreclr/jit/codegenxarch.cpp | 21 ++--
src/coreclr/jit/emit.cpp | 119 +++++++++++++-------
src/coreclr/jit/emit.h | 9 +-
src/coreclr/jit/emitxarch.cpp | 6 +-
src/coreclr/jit/gentree.cpp | 3 -
src/coreclr/jit/hwintrinsic.cpp | 3 -
src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 10 +-
src/coreclr/jit/hwintrinsiclistxarch.h | 6 +-
src/coreclr/jit/instr.cpp | 52
+-------- src/coreclr/jit/lower.h | 1 - src/coreclr/jit/lowerxarch.cpp | 118 ------------------- src/coreclr/jit/simdcodegenxarch.cpp | 4 +- 13 files changed, 108 insertions(+), 258 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index b472ab2675b27d..0d11bc4721c86e 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2482,19 +2482,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre { // Get a temp integer register to compute long address. regNumber addrReg = internalRegisters.GetSingle(tree); - CORINFO_FIELD_HANDLE hnd; - if (is8) - { - simd8_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t)); - hnd = emit->emitSimd8Const(constValue); - } - else - { - simd16_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); - } + CORINFO_FIELD_HANDLE hnd = emit->emitSimdConst(&val, is8 ? EA_8BYTE : EA_16BYTE); emit->emitIns_R_C(INS_ldr, attr, targetReg, addrReg, hnd, 0); } } diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b02777d67ae517..8f58e1c0aaa097 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -453,8 +453,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -481,10 +480,9 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - simd16_t val16 = {}; + simd_t val16 = {}; memcpy(&val16, &val12, sizeof(val12)); - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, EA_16BYTE, targetReg); } break; } @@ -511,8 +509,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -539,8 +536,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -565,8 +561,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -7748,11 +7743,11 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) assert(!"genSSE2BitwiseOp: unsupported oper"); } - simd16_t constValue; + simd_t constValue = {}; constValue.u64[0] = mask; constValue.u64[1] = mask; #if defined(FEATURE_SIMD) - maskFld = GetEmitter()->emitSimd16Const(constValue); + maskFld = GetEmitter()->emitSimdConst(&constValue, EA_16BYTE); #else maskFld = GetEmitter()->emitBlkConst(&constValue, 16, 16, treeNode->TypeGet()); #endif diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 57ce4d313549a6..9a3deef00e3416 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8118,10 
+8118,11 @@ CORINFO_FIELD_HANDLE emitter::emitFltOrDblConst(double constValue, emitAttr attr
#if defined(FEATURE_SIMD)
//------------------------------------------------------------------------
-// emitSimd8Const: Create a simd8 data section constant.
+// emitSimdConst: Create a simd data section constant.
//
// Arguments:
// constValue - constant value
+// attr - The EA_SIZE for the constant type
//
// Return Value:
// A field handle representing the data offset to access the constant.
@@ -8131,10 +8132,11 @@ CORINFO_FIELD_HANDLE emitter::emitFltOrDblConst(double constValue, emitAttr attr
// (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
// to constant data, not a real static field.
//
-CORINFO_FIELD_HANDLE emitter::emitSimd8Const(simd8_t constValue)
+CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr)
{
- unsigned cnsSize = 8;
- unsigned cnsAlign = cnsSize;
+ unsigned cnsSize = EA_SIZE(attr);
+ unsigned cnsAlign = cnsSize;
+ var_types dataType = (cnsSize >= 8) ? emitComp->getSIMDTypeForSize(cnsSize) : TYP_FLOAT;
#ifdef TARGET_XARCH
if (emitComp->compCodeOpt() == Compiler::SMALL_CODE)
@@ -8143,56 +8145,95 @@ CORINFO_FIELD_HANDLE emitter::emitSimd8Const(simd8_t constValue)
{
cnsAlign = dataSection::MIN_DATA_ALIGN;
}
#endif // TARGET_XARCH
- UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD8);
+ UNATIVE_OFFSET cnum = emitDataConst(constValue, cnsSize, cnsAlign, dataType);
return emitComp->eeFindJitDataOffs(cnum);
}
-CORINFO_FIELD_HANDLE emitter::emitSimd16Const(simd16_t constValue)
+#ifdef TARGET_XARCH
+//------------------------------------------------------------------------
+// emitSimdConstCompressedLoad: Create a simd data section constant,
+// compressing it if possible, and emit an appropriate instruction
+// to load or broadcast the constant to a register.
+//
+// Arguments:
+// constValue - constant value
+// attr - The EA_SIZE for the constant type
+// targetReg - The target register
+//
+void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg)
{
- unsigned cnsSize = 16;
- unsigned cnsAlign = cnsSize;
+ instruction ins = INS_movups;
+ unsigned cnsSize = EA_SIZE(attr);
-#ifdef TARGET_XARCH
- if (emitComp->compCodeOpt() == Compiler::SMALL_CODE)
+ for (unsigned size = 4; size < cnsSize; size *= 2)
{
- cnsAlign = dataSection::MIN_DATA_ALIGN;
- }
-#endif // TARGET_XARCH
+ // If the constant has zero upper elements/lanes, we can use a smaller load,
+ // because all scalar and vector loads from memory zero the upper.
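+ // For example, a TYP_SIMD16 constant { 1.0f, 0, 0, 0 } needs only 4 bytes of data
+ // and a movss, and { 1.0, 0.0 } needs only 8 bytes and a movsd.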
- UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD16); - return emitComp->eeFindJitDataOffs(cnum); -} + simd_t val = {}; + memcpy(&val, constValue, size); -#if defined(TARGET_XARCH) -CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue) -{ - unsigned cnsSize = 32; - unsigned cnsAlign = cnsSize; + if (memcmp(&val, constValue, cnsSize) == 0) + { + switch (size) + { + case 4: + ins = INS_movss; + break; + case 8: + ins = INS_movsd_simd; + break; + default: + ins = INS_movups; + break; + } - if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) - { - cnsAlign = dataSection::MIN_DATA_ALIGN; - } + cnsSize = size; + attr = EA_ATTR(size); + break; + } - UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD32); - return emitComp->eeFindJitDataOffs(cnum); -} + if (((size == 8) && (cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) || + emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + // If the value repeats, we can use an appropriate-sized broadcast instruction. -CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue) -{ - unsigned cnsSize = 64; - unsigned cnsAlign = cnsSize; + for (unsigned i = 1; i < (cnsSize / size); i++) + { + memcpy((simd_t*)((byte*)&val + (i * size)), constValue, size); + } - if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) - { - cnsAlign = dataSection::MIN_DATA_ALIGN; + if (memcmp(&val, constValue, cnsSize) == 0) + { + switch (size) + { + case 4: + ins = INS_vbroadcastss; + break; + case 8: + ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd; + break; + case 16: + ins = INS_vbroadcastf128; + break; + case 32: + assert(emitComp->IsBaselineVector512IsaSupportedDebugOnly()); + ins = INS_vbroadcastf32x8; + break; + default: + unreached(); + } + + cnsSize = size; + break; + } + } } - UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD64); - return emitComp->eeFindJitDataOffs(cnum); + CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, EA_ATTR(cnsSize)); + emitIns_R_C(ins, attr, targetReg, hnd, 0); } - -#endif // TARGET_XARCH +#endif #if defined(FEATURE_MASKED_HW_INTRINSICS) CORINFO_FIELD_HANDLE emitter::emitSimdMaskConst(simdmask_t constValue) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index ef1fd2f701fc15..74d4f5cae5b2c2 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2585,13 +2585,10 @@ class emitter CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr); #if defined(FEATURE_SIMD) - CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); - CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); + CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr); #if defined(TARGET_XARCH) - CORINFO_FIELD_HANDLE emitSimd32Const(simd32_t constValue); - CORINFO_FIELD_HANDLE emitSimd64Const(simd64_t constValue); -#endif // TARGET_XARCH - + void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg); +#endif #if defined(FEATURE_MASKED_HW_INTRINSICS) CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b227f628f79693..5369549b83fbf8 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -7331,6 +7331,7 @@ bool emitter::IsMovInstruction(instruction ins) case INS_vmovdqu8: case INS_vmovdqu16: case INS_vmovdqu64: + case INS_movq: case INS_movsd_simd: case INS_movss: case INS_movsx: @@ 
-7350,7 +7351,6 @@ bool emitter::IsMovInstruction(instruction ins) } #if defined(TARGET_AMD64) - case INS_movq: case INS_movsxd: { return true; @@ -7501,7 +7501,6 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) break; } -#if defined(TARGET_AMD64) case INS_movq: { // Clears the upper bits @@ -7509,6 +7508,7 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) break; } +#if defined(TARGET_AMD64) case INS_movsxd: { // Sign-extends the source @@ -7781,13 +7781,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN break; } -#if defined(TARGET_AMD64) case INS_movq: { assert(isFloatReg(dstReg) && isFloatReg(srcReg)); break; } +#if defined(TARGET_AMD64) case INS_movsxd: { assert(isGeneralRegister(dstReg) && isGeneralRegister(srcReg)); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 6a062b02f2b12d..1745c07366a7d9 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28266,9 +28266,6 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: if (GetAuxiliaryJitType() == CORINFO_TYPE_PTR) { addr = Op(1); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index a00d57962d757b..f7eddafc80f6be 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2078,9 +2078,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: { // These intrinsics have both pointer and vector overloads // We want to be able to differentiate between them so lets diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index e556d032b47e06..dcabb732da4e34 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1949,14 +1949,14 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) else { assert(targetReg != op1Reg); - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg, instOptions); + instGen_Set_Reg_To_Zero(attr, targetReg); emit->emitIns_Mov(INS_movss, attr, targetReg, op1Reg, /* canSkip */ false); } } else { // `movq xmm xmm` zeroes the upper 64 bits. - genHWIntrinsic_R_RM(node, INS_movq, attr, targetReg, op1, instOptions); + emit->emitIns_Mov(INS_movq, attr, targetReg, op1Reg, /* canSkip */ false); } break; } @@ -2285,10 +2285,8 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { minValueInt.i32[i] = INT_MIN; } - CORINFO_FIELD_HANDLE minValueFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(minValueInt.v128[0]) - : emit->emitSimd32Const(minValueInt.v256[0]); - CORINFO_FIELD_HANDLE negOneFld = typeSize == EA_16BYTE ? 
emit->emitSimd16Const(negOneIntVec.v128[0]) - : emit->emitSimd32Const(negOneIntVec.v256[0]); + CORINFO_FIELD_HANDLE minValueFld = emit->emitSimdConst(&minValueInt, typeSize); + CORINFO_FIELD_HANDLE negOneFld = emit->emitSimdConst(&negOneIntVec, typeSize); // div-by-zero check emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index a49d4b4bdc66bf..1750fed537f1b1 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -843,7 +843,7 @@ HARDWARE_INTRINSIC(AVX2, Blend, HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -915,8 +915,8 @@ HARDWARE_INTRINSIC(AVX512F, And, HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, Compare, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d3dfd1bfff4ffa..ea6f8a38eb549b 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -991,69 +991,25 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op) case GT_CNS_VEC: { insTupleType tupleType = emit->insTupleTypeInfo(ins); + unsigned cnsSize = genTypeSize(op); if ((tupleType == INS_TT_TUPLE1_SCALAR) || (tupleType == INS_TT_TUPLE1_FIXED)) { // We have a vector const, but the instruction will only read a scalar from it, // so don't waste space putting the entire vector to the data section. 
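// (For example, addsd reading a contained Vector128<double> constant needs only
// the low 8 bytes, so only those bytes are emitted to the data section.)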
- unsigned cnsSize = max(CodeGenInterface::instInputSize(ins), 4U); + cnsSize = max(CodeGenInterface::instInputSize(ins), 4U); assert(cnsSize <= genTypeSize(op)); - - UNATIVE_OFFSET cnum = emit->emitDataConst(&op->AsVecCon()->gtSimdVal, cnsSize, cnsSize, TYP_UNDEF); - return OperandDesc(compiler->eeFindJitDataOffs(cnum)); } - switch (op->TypeGet()) - { - case TYP_SIMD8: - { - simd8_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd8_t)); - return OperandDesc(emit->emitSimd8Const(constValue)); - } - - case TYP_SIMD12: - { - simd16_t constValue = {}; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd12_t)); - return OperandDesc(emit->emitSimd16Const(constValue)); - } - case TYP_SIMD16: - { - simd16_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd16_t)); - return OperandDesc(emit->emitSimd16Const(constValue)); - } - - case TYP_SIMD32: - { - simd32_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd32_t)); - return OperandDesc(emit->emitSimd32Const(constValue)); - } - - case TYP_SIMD64: - { - simd64_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd64_t)); - return OperandDesc(emit->emitSimd64Const(constValue)); - } - - default: - { - unreached(); - } - } + return OperandDesc(emit->emitSimdConst(&op->AsVecCon()->gtSimdVal, EA_TYPE(cnsSize))); } #endif // FEATURE_SIMD #if defined(FEATURE_MASKED_HW_INTRINSICS) case GT_CNS_MSK: { - simdmask_t constValue; - memcpy(&constValue, &op->AsMskCon()->gtSimdMaskVal, sizeof(simdmask_t)); - return OperandDesc(emit->emitSimdMaskConst(constValue)); + return OperandDesc(emit->emitSimdMaskConst(op->AsMskCon()->gtSimdMaskVal)); } #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index d44880bd947554..60afd35e73c5b4 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -131,7 +131,6 @@ class Lowering final : public Phase void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); - void TryCompressConstVecData(GenTreeStoreInd* node); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 60df018d7b75dc..67fefa850ce11d 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -107,30 +107,6 @@ GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node) } ContainCheckStoreIndir(node); -#if defined(FEATURE_HW_INTRINSICS) - if (comp->IsBaselineVector512IsaSupportedOpportunistically() || - comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - if (!node->Data()->IsCnsVec()) - { - return node->gtNext; - } - - if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64)) - { - return node->gtNext; - } - - if (node->Data()->IsVectorAllBitsSet() || node->Data()->IsVectorZero()) - { - // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors. 
- return node->gtNext; - } - - TryCompressConstVecData(node); - } -#endif - return node->gtNext; } @@ -8819,9 +8795,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: { // These can have either pointer or vector operands. For the pointer case, we can't check // size, so just assume it matches. Otherwise, do normal size check based on tuple type. @@ -9380,83 +9353,6 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, MakeSrcContained(parentNode, childNode); } -//---------------------------------------------------------------------------------------------- -// TryCompressConstVecData: -// Try to compress the constant vector input if it has duplicated parts and can be optimized by -// broadcast -// -// Arguments: -// node - the storeind node. -// -// Return: -// return true if compress success. -void Lowering::TryCompressConstVecData(GenTreeStoreInd* node) -{ - assert(node->Data()->IsCnsVec()); - assert(node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64)); - - GenTreeVecCon* vecCon = node->Data()->AsVecCon(); - GenTreeHWIntrinsic* broadcast = nullptr; - - if (vecCon->TypeIs(TYP_SIMD32)) - { - assert(comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)); - if (vecCon->gtSimd32Val.v128[0] == vecCon->gtSimdVal.v128[1]) - { - simd16_t simd16Val = {}; - simd16Val.f64[0] = vecCon->gtSimd32Val.f64[0]; - simd16Val.f64[1] = vecCon->gtSimd32Val.f64[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); - memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, compressedVecCon, - NI_AVX2_BroadcastVector128ToVector256, CORINFO_TYPE_UINT, 32); - } - } - else - { - assert(vecCon->TypeIs(TYP_SIMD64)); - assert(comp->IsBaselineVector512IsaSupportedOpportunistically()); - if (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) - { - simd16_t simd16Val = {}; - simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; - simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); - memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, - NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64); - } - else if (vecCon->gtSimd64Val.v256[0] == vecCon->gtSimd64Val.v256[1]) - { - simd32_t simd32Val = {}; - simd32Val.v128[0] = vecCon->gtSimd32Val.v128[0]; - simd32Val.v128[1] = vecCon->gtSimd32Val.v128[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD32); - memcpy(&compressedVecCon->gtSimdVal, &simd32Val, sizeof(simd32_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - broadcast = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector256ToVector512, - CORINFO_TYPE_ULONG, 64); - } - } - - if (broadcast == nullptr) - { - return; - } - - 
BlockRange().InsertBefore(node, broadcast); - node->Data() = broadcast; - LowerNode(broadcast); -} - //------------------------------------------------------------------------ // TryMakeSrcContainedOrRegOptional: Tries to make "childNode" a contained or regOptional node // @@ -9710,20 +9606,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: - { - if (node->OperIsMemoryLoad()) - { - ContainCheckHWIntrinsicAddr(node, op1, /* conservative maximum */ 32); - return; - } - - assert(op1->IsCnsVec()); - break; - } - case NI_AVX512F_ConvertToVector256Int32: case NI_AVX512F_ConvertToVector256UInt32: case NI_AVX512F_VL_ConvertToVector128UInt32: diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index 57fcb5d4688004..e95dea7d63c246 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -536,12 +536,12 @@ void CodeGen::genSimd12UpperClear(regNumber tgtReg) else { // Preserve element 0, 1, and 2; Zero element 3 - simd16_t constValue; + simd_t constValue = {}; constValue.u32[0] = 0xFFFFFFFF; constValue.u32[1] = 0xFFFFFFFF; constValue.u32[2] = 0xFFFFFFFF; constValue.u32[3] = 0x00000000; - CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue); + CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimdConst(&constValue, EA_16BYTE); GetEmitter()->emitIns_SIMD_R_R_C(INS_andps, EA_16BYTE, tgtReg, tgtReg, zroSimd12Elm3, 0, INS_OPTS_NONE); } } From 3632656d02d0adf93416afc91b25e55df0af2fd5 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 1 Apr 2025 23:21:53 -0700 Subject: [PATCH 06/10] fix build --- src/coreclr/jit/emit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 9a3deef00e3416..eb159fed1f497f 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8200,7 +8200,7 @@ void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, reg for (unsigned i = 1; i < (cnsSize / size); i++) { - memcpy((simd_t*)((byte*)&val + (i * size)), constValue, size); + memcpy((simd_t*)((uint8_t*)&val + (i * size)), constValue, size); } if (memcmp(&val, constValue, cnsSize) == 0) From 37f0a7d792917a4f4d19a384e0e4be83df858500 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 2 Apr 2025 00:50:53 -0700 Subject: [PATCH 07/10] fix throughput regression --- src/coreclr/jit/codegenarm64.cpp | 14 +++++- src/coreclr/jit/codegenxarch.cpp | 4 +- src/coreclr/jit/emit.cpp | 48 ++++++++++++++++++++- src/coreclr/jit/emit.h | 6 ++- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 2 +- src/coreclr/jit/simdcodegenxarch.cpp | 4 +- 6 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 0d11bc4721c86e..b472ab2675b27d 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2482,7 +2482,19 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre { // Get a temp integer register to compute long address. regNumber addrReg = internalRegisters.GetSingle(tree); - CORINFO_FIELD_HANDLE hnd = emit->emitSimdConst(&val, is8 ? 
EA_8BYTE : EA_16BYTE); + CORINFO_FIELD_HANDLE hnd; + if (is8) + { + simd8_t constValue; + memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t)); + hnd = emit->emitSimd8Const(constValue); + } + else + { + simd16_t constValue; + memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t)); + hnd = emit->emitSimd16Const(constValue); + } emit->emitIns_R_C(INS_ldr, attr, targetReg, addrReg, hnd, 0); } } diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 8f58e1c0aaa097..28734180af6b66 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -7743,11 +7743,11 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) assert(!"genSSE2BitwiseOp: unsupported oper"); } - simd_t constValue = {}; + simd16_t constValue; constValue.u64[0] = mask; constValue.u64[1] = mask; #if defined(FEATURE_SIMD) - maskFld = GetEmitter()->emitSimdConst(&constValue, EA_16BYTE); + maskFld = GetEmitter()->emitSimd16Const(constValue); #else maskFld = GetEmitter()->emitBlkConst(&constValue, 16, 16, treeNode->TypeGet()); #endif diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index eb159fed1f497f..8c36d1db0bcd51 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8118,6 +8118,53 @@ CORINFO_FIELD_HANDLE emitter::emitFltOrDblConst(double constValue, emitAttr attr #if defined(FEATURE_SIMD) //------------------------------------------------------------------------ +// emitSimd8Const: Create a simd8 data section constant. +// +// Arguments: +// constValue - constant value +// +// Return Value: +// A field handle representing the data offset to access the constant. +// +// Note: +// Access to inline data is 'abstracted' by a special type of static member +// (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference +// to constant data, not a real static field. +// +CORINFO_FIELD_HANDLE emitter::emitSimd8Const(simd8_t constValue) +{ + unsigned cnsSize = 8; + unsigned cnsAlign = cnsSize; + +#ifdef TARGET_XARCH + if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) + { + cnsAlign = dataSection::MIN_DATA_ALIGN; + } +#endif // TARGET_XARCH + + UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD8); + return emitComp->eeFindJitDataOffs(cnum); +} + +CORINFO_FIELD_HANDLE emitter::emitSimd16Const(simd16_t constValue) +{ + unsigned cnsSize = 16; + unsigned cnsAlign = cnsSize; + +#ifdef TARGET_XARCH + if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) + { + cnsAlign = dataSection::MIN_DATA_ALIGN; + } +#endif // TARGET_XARCH + + UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD16); + return emitComp->eeFindJitDataOffs(cnum); +} + +#ifdef TARGET_XARCH +//------------------------------------------------------------------------ // emitSimdConst: Create a simd data section constant. 
//
// Arguments:
@@ -8149,7 +8196,6 @@ CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr)
return emitComp->eeFindJitDataOffs(cnum);
}
-#ifdef TARGET_XARCH
//------------------------------------------------------------------------
// emitSimdConstCompressedLoad: Create a simd data section constant,
// compressing it if possible, and emit an appropriate instruction
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 74d4f5cae5b2c2..2c47f85033dffc 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -2585,9 +2585,11 @@ class emitter
CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr);
#if defined(FEATURE_SIMD)
- CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr);
+ CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);
+ CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue);
#if defined(TARGET_XARCH)
- void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg);
+ CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr);
+ void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg);
#endif
#if defined(FEATURE_MASKED_HW_INTRINSICS)
CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue);
#endif // FEATURE_MASKED_HW_INTRINSICS
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index dcabb732da4e34..ea114113412d81 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1949,7 +1949,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
else
{
assert(targetReg != op1Reg);
- instGen_Set_Reg_To_Zero(attr, targetReg);
+ emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg, instOptions);
emit->emitIns_Mov(INS_movss, attr, targetReg, op1Reg, /* canSkip */ false);
}
}
diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp
index e95dea7d63c246..57fcb5d4688004 100644
--- a/src/coreclr/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/jit/simdcodegenxarch.cpp
@@ -536,12 +536,12 @@ void CodeGen::genSimd12UpperClear(regNumber tgtReg)
else
{
// Preserve element 0, 1, and 2; Zero element 3
- simd_t constValue = {};
+ simd16_t constValue;
constValue.u32[0] = 0xFFFFFFFF;
constValue.u32[1] = 0xFFFFFFFF;
constValue.u32[2] = 0xFFFFFFFF;
constValue.u32[3] = 0x00000000;
- CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimdConst(&constValue, EA_16BYTE);
+ CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue);
GetEmitter()->emitIns_SIMD_R_R_C(INS_andps, EA_16BYTE, tgtReg, tgtReg, zroSimd12Elm3, 0, INS_OPTS_NONE);
}
}
From 2c269a34704599870a013c172aad092200028373 Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Wed, 2 Apr 2025 02:15:31 -0700
Subject: [PATCH 08/10] use correct instruction for SIMD8
---
src/coreclr/jit/emit.cpp | 3 ++-
src/coreclr/jit/emit.h | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 6c06f3394f5f7c..ed05c0a4a603a1 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -8208,8 +8208,9 @@ CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr)
//
void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg)
{
- instruction ins = INS_movups;
+ assert(EA_SIZE(attr) >= 8);
unsigned cnsSize = EA_SIZE(attr);
+ instruction ins = (cnsSize == 8) ?
INS_movsd_simd : INS_movups; for (unsigned size = 4; size < cnsSize; size *= 2) { diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 92a9b4d005b21d..7e3d28d5be7a63 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2590,7 +2590,7 @@ class emitter #if defined(TARGET_XARCH) CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr); void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg); -#endif // TARGET_XRACH +#endif // TARGET_XARCH #if defined(FEATURE_MASKED_HW_INTRINSICS) CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); #endif // FEATURE_MASKED_HW_INTRINSICS From 49bf58f114837fc0f3966658a8f2aa22098f9554 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 2 Apr 2025 15:07:38 -0700 Subject: [PATCH 09/10] update compress logic --- src/coreclr/jit/emit.cpp | 130 ++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 57 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index ed05c0a4a603a1..009ddde60d7078 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8208,76 +8208,92 @@ CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr) // void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg) { - assert(EA_SIZE(attr) >= 8); - unsigned cnsSize = EA_SIZE(attr); - instruction ins = (cnsSize == 8) ? INS_movsd_simd : INS_movups; + assert(EA_SIZE(attr) >= 8 && EA_SIZE(attr) <= 64); - for (unsigned size = 4; size < cnsSize; size *= 2) + unsigned cnsSize = EA_SIZE(attr); + unsigned dataSize = cnsSize; + instruction ins = (cnsSize == 8) ? INS_movsd_simd : INS_movups; + + // Most constant vectors tend to have repeated values, so we will first check to see if + // we can replace a full vector load with a smaller broadcast. + + if ((dataSize == 64) && (constValue->v256[1] == constValue->v256[0])) { - // If the constant has zero upper elements/lanes, we can use a smaller load, - // because all scalar and vector loads from memory zero the upper. + assert(emitComp->IsBaselineVector512IsaSupportedDebugOnly()); + dataSize = 32; + ins = INS_vbroadcastf32x8; + } - simd_t val = {}; - memcpy(&val, constValue, size); + if ((dataSize == 32) && (constValue->v128[1] == constValue->v128[0])) + { + assert(emitComp->IsBaselineVector256IsaSupportedDebugOnly()); + dataSize = 16; + ins = INS_vbroadcastf128; + } - if (memcmp(&val, constValue, cnsSize) == 0) + if ((dataSize == 16) && (constValue->u64[1] == constValue->u64[0])) + { + if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) || + emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) { - switch (size) - { - case 4: - ins = INS_movss; - break; - case 8: - ins = INS_movsd_simd; - break; - default: - ins = INS_movups; - break; - } - - cnsSize = size; - attr = EA_ATTR(size); - break; + dataSize = 8; + ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd; } + } - if (((size == 8) && (cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) || - emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + if ((dataSize == 8) && (cnsSize >= 16) && (constValue->u32[1] == constValue->u32[0])) + { + if (emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) { - // If the value repeats, we can use an appropriate-sized broadcast instruction. 
+ dataSize = 4; + ins = INS_vbroadcastss; + } + } - for (unsigned i = 1; i < (cnsSize / size); i++) - { - memcpy((simd_t*)((uint8_t*)&val + (i * size)), constValue, size); - } + if (dataSize < cnsSize) + { + // We found a broadcast match, so emit the broadcast instruction and return. + // Here we use the original emitAttr for the instruction, because we need to + // produce a register of the original constant's size, filled with the pattern. - if (memcmp(&val, constValue, cnsSize) == 0) - { - switch (size) - { - case 4: - ins = INS_vbroadcastss; - break; - case 8: - ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd; - break; - case 16: - ins = INS_vbroadcastf128; - break; - case 32: - assert(emitComp->IsBaselineVector512IsaSupportedDebugOnly()); - ins = INS_vbroadcastf32x8; - break; - default: - unreached(); - } + CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, EA_ATTR(dataSize)); + emitIns_R_C(ins, attr, targetReg, hnd, 0); + return; + } - cnsSize = size; - break; - } - } + // Otherwise, if the upper lanes and/or elements of the constant are zero, we can use a + // smaller load, because all scalar and vector memory load instructions zero the uppers. + + simd32_t zeroValue = {}; + + if ((dataSize == 64) && (constValue->v256[1] == zeroValue)) + { + dataSize = 32; + } + + if ((dataSize == 32) && (constValue->v128[1] == zeroValue.v128[0])) + { + dataSize = 16; + } + + if ((dataSize == 16) && (constValue->u64[1] == 0)) + { + dataSize = 8; + ins = INS_movsd_simd; + } + + if ((dataSize == 8) && (constValue->u32[1] == 0)) + { + dataSize = 4; + ins = INS_movss; } - CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, EA_ATTR(cnsSize)); + // Here we set the emitAttr to the size of the actual load. It will zero extend + // up to the native SIMD register size. + + attr = EA_ATTR(dataSize); + + CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, attr); emitIns_R_C(ins, attr, targetReg, hnd, 0); } #endif // TARGET_XARCH From a2fe9f8b02d7ec8474570c1079ba81c45846fdd9 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 10 Apr 2025 15:38:42 -0700 Subject: [PATCH 10/10] add comment --- src/coreclr/jit/emit.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 038fea70cd98cd..0e30b1258913ba 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8241,6 +8241,9 @@ void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, reg } } + // `vbroadcastss` fills the full SIMD register, so we can't do this last step if the + // original constant was smaller than a full reg (e.g. TYP_SIMD8) + if ((dataSize == 8) && (cnsSize >= 16) && (constValue->u32[1] == constValue->u32[0])) { if (emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX))