Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void genLoadIndTypeSimd12(GenTreeIndir* treeNode);
void genStoreLclTypeSimd12(GenTreeLclVarCommon* treeNode);
void genLoadLclTypeSimd12(GenTreeLclVarCommon* treeNode);
#ifdef TARGET_XARCH
void genEmitStoreLclTypeSimd12(GenTree* store, unsigned lclNum, unsigned offset);
void genEmitLoadLclTypeSimd12(regNumber tgtReg, unsigned lclNum, unsigned offset);
#endif // TARGET_XARCH
#ifdef TARGET_X86
void genStoreSimd12ToStack(regNumber dataReg, regNumber tmpReg);
void genPutArgStkSimd12(GenTreePutArgStk* treeNode);
Expand Down
11 changes: 10 additions & 1 deletion src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7762,6 +7762,11 @@ GenTreeField* Compiler::gtNewFieldRef(var_types type, CORINFO_FIELD_HANDLE fldHn
fieldNode->gtFlags |= GTF_GLOB_REF;
}

if ((obj != nullptr) && fgAddrCouldBeNull(obj))
{
fieldNode->gtFlags |= GTF_EXCEPT;
}

return fieldNode;
}

Expand Down Expand Up @@ -7794,7 +7799,11 @@ GenTreeField* Compiler::gtNewFieldAddrNode(var_types type, CORINFO_FIELD_HANDLE
varDsc->lvFieldAccessed = 1;
}

// TODO-ADDR: add GTF_EXCEPT handling here and delete it from callers.
if ((obj != nullptr) && fgAddrCouldBeNull(obj))
{
fieldNode->gtFlags |= GTF_EXCEPT;
}

return fieldNode;
}

Expand Down
16 changes: 0 additions & 16 deletions src/coreclr/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6846,12 +6846,6 @@ void Compiler::impImportBlockCode(BasicBlock* block)
tiRetVal = se.seTypeInfo;
}

// TODO-ADDR: delete once all RET_EXPRs are typed properly.
if (varTypeIsSIMD(lclTyp))
{
op1->gtType = lclTyp;
}

// Note this will downcast TYP_I_IMPL into a 32-bit Int on 64 bit (for x86 JIT compatibility).
op1 = impImplicitIorI4Cast(op1, lclTyp);
op1 = impImplicitR4orR8Cast(op1, lclTyp);
Expand Down Expand Up @@ -9375,11 +9369,6 @@ void Compiler::impImportBlockCode(BasicBlock* block)
}
#endif

if (fgAddrCouldBeNull(obj))
{
op1->gtFlags |= GTF_EXCEPT;
}

if (StructHasOverlappingFields(info.compCompHnd->getClassAttribs(resolvedToken.hClass)))
{
op1->AsField()->gtFldMayOverlap = true;
Expand Down Expand Up @@ -9664,11 +9653,6 @@ void Compiler::impImportBlockCode(BasicBlock* block)
}
#endif

if (fgAddrCouldBeNull(obj))
{
op1->gtFlags |= GTF_EXCEPT;
}

if (compIsForInlining() &&
impInlineIsGuaranteedThisDerefBeforeAnySideEffects(op2, nullptr, obj,
impInlineInfo->inlArgInfo))
Expand Down
82 changes: 4 additions & 78 deletions src/coreclr/jit/rationalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,82 +28,6 @@ void copyFlags(GenTree* dst, GenTree* src, GenTreeFlags mask)
dst->gtFlags |= (src->gtFlags & mask);
}

// RewriteIndir: Rewrite an indirection and clear the flags that should not be set after rationalize.
//
// Arguments:
// use - A use of an indirection node.
//
void Rationalizer::RewriteIndir(LIR::Use& use)
{
GenTreeIndir* indir = use.Def()->AsIndir();
assert(indir->OperIs(GT_IND, GT_BLK));

if (varTypeIsSIMD(indir))
{
if (indir->OperIs(GT_BLK))
{
indir->SetOper(GT_IND);
}

RewriteSIMDIndir(use);
}
}

// RewriteSIMDIndir: Rewrite a SIMD indirection as a simple lclVar if possible.
//
// Arguments:
// use - A use of a GT_IND node of SIMD type
//
// TODO-ADDR: today this only exists because the xarch backend does not handle
// IND<simd12>(LCL_VAR_ADDR/LCL_FLD_ADDR) when the address is contained correctly.
//
void Rationalizer::RewriteSIMDIndir(LIR::Use& use)
{
#ifdef FEATURE_SIMD
GenTreeIndir* indir = use.Def()->AsIndir();
assert(indir->OperIs(GT_IND));
var_types simdType = indir->TypeGet();
assert(varTypeIsSIMD(simdType));

GenTree* addr = indir->Addr();

if (addr->IsLclVarAddr() && varTypeIsSIMD(comp->lvaGetDesc(addr->AsLclFld())))
{
// If we have GT_IND(GT_LCL_ADDR<0>) and the var is a SIMD type,
// replace the expression by GT_LCL_VAR or GT_LCL_FLD.
BlockRange().Remove(indir);

const unsigned lclNum = addr->AsLclFld()->GetLclNum();
LclVarDsc* varDsc = comp->lvaGetDesc(lclNum);

var_types lclType = varDsc->TypeGet();

if (lclType == simdType)
{
addr->SetOper(GT_LCL_VAR);
}
else
{
addr->SetOper(GT_LCL_FLD);

if (((addr->gtFlags & GTF_VAR_DEF) != 0) && (genTypeSize(simdType) < genTypeSize(lclType)))
{
addr->gtFlags |= GTF_VAR_USEASG;
}

comp->lvaSetVarDoNotEnregister(lclNum DEBUGARG(DoNotEnregisterReason::LocalField));
}
if (varDsc->lvPromoted)
{
comp->lvaSetVarDoNotEnregister(lclNum DEBUGARG(DoNotEnregisterReason::BlockOp));
}

addr->gtType = simdType;
use.ReplaceWith(addr);
}
#endif // FEATURE_SIMD
}

// RewriteNodeAsCall : Replace the given tree node by a GT_CALL.
//
// Arguments:
Expand Down Expand Up @@ -469,9 +393,11 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, Compiler::Ge
RewriteAssignment(use);
break;

case GT_IND:
case GT_BLK:
RewriteIndir(use);
if (varTypeIsSIMD(node))
{
node->SetOper(GT_IND);
}
break;

case GT_CALL:
Expand Down
5 changes: 0 additions & 5 deletions src/coreclr/jit/rationalize.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,6 @@ class Rationalizer final : public Phase
return LIR::AsRange(m_block);
}

void RewriteIndir(LIR::Use& use);

// SIMD related
void RewriteSIMDIndir(LIR::Use& use);

// Intrinsic related transformations
void RewriteNodeAsCall(GenTree** use,
ArrayStack<GenTree*>& parents,
Expand Down
129 changes: 72 additions & 57 deletions src/coreclr/jit/simdcodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#include "gcinfo.h"
#include "gcinfoencoder.h"

// Instruction immediates

// Insertps:
// - bits 6 and 7 of the immediate indicate which source item to select (0..3)
// - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
// - bits 0 to 3 of the immediate indicate which target item to zero
#define INSERTPS_SOURCE_SELECT(i) ((i) << 6)
#define INSERTPS_TARGET_SELECT(i) ((i) << 4)
#define INSERTPS_ZERO(i) (1 << (i))

// ROUNDPS/PD:
// - Bit 0 through 1 - Rounding mode
// * 0b00 - Round to nearest (even)
// * 0b01 - Round toward Neg. Infinity
// * 0b10 - Round toward Pos. Infinity
// * 0b11 - Round toward zero (Truncate)
// - Bit 2 - Source of rounding control, 0b0 for immediate.
// - Bit 3 - Precision exception, 0b1 to ignore. (We don't raise FP exceptions)
#define ROUNDPS_TO_NEAREST_IMM 0b1000
#define ROUNDPS_TOWARD_NEGATIVE_INFINITY_IMM 0b1001
#define ROUNDPS_TOWARD_POSITIVE_INFINITY_IMM 0b1010
#define ROUNDPS_TOWARD_ZERO_IMM 0b1011

//-----------------------------------------------------------------------------
// genStoreIndTypeSimd12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
// Since Vector3 is not a hardware supported write size, it is performed
Expand All @@ -70,6 +47,13 @@ void CodeGen::genStoreIndTypeSimd12(GenTreeStoreInd* treeNode)
GenTree* addr = treeNode->Addr();
genConsumeAddress(addr);

if (addr->isContained() && addr->OperIs(GT_LCL_ADDR))
{
genEmitStoreLclTypeSimd12(treeNode, addr->AsLclFld()->GetLclNum(), addr->AsLclFld()->GetLclOffs());
genUpdateLife(treeNode);
return;
}
Comment on lines +50 to +55
Copy link
Contributor Author

@SingleAccretion SingleAccretion Apr 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An alternative fix to this would have been to simply disallow containment of local addresses for SIMD12s.

This would work because accurate liveness tracking for SIMD12s is neither used nor important, but it felt more consistent with the rest of codegen to handle it. Perhaps, one day, we'll delete this code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's quite confusing that emitHandleMemOp does not support contained LCL_ADDR nodes when Lowering::ContainCheckIndir does. I wonder if we have more bugs around this.


GenTree* data = treeNode->Data();
regNumber dataReg = genConsumeReg(data);

Expand Down Expand Up @@ -142,6 +126,13 @@ void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode)
GenTree* addr = treeNode->Addr();
genConsumeAddress(addr);

if (addr->isContained() && addr->OperIs(GT_LCL_ADDR))
{
genEmitLoadLclTypeSimd12(treeNode->GetRegNum(), addr->AsLclFld()->GetLclNum(), addr->AsLclFld()->GetLclOffs());
genProduceReg(treeNode);
return;
}

emitter* emit = GetEmitter();
regNumber tgtReg = treeNode->GetRegNum();
bool useSse41 = compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41);
Expand Down Expand Up @@ -249,29 +240,7 @@ void CodeGen::genStoreLclTypeSimd12(GenTreeLclVarCommon* treeNode)
}
else
{
// Store lower 8 bytes
emit->emitIns_S_R(INS_movsd_simd, EA_8BYTE, dataReg, varNum, offs);

if (data->IsVectorZero())
{
// Store upper 4 bytes
emit->emitIns_S_R(INS_movss, EA_4BYTE, dataReg, varNum, offs + 8);
}
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
// Extract and store upper 4 bytes
emit->emitIns_S_R_I(INS_extractps, EA_16BYTE, varNum, offs + 8, dataReg, 2);
}
else
{
regNumber tmpReg = treeNode->GetSingleTempReg();

// Extract upper 4 bytes from data
emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg);

// Store upper 4 bytes
emit->emitIns_S_R(INS_movss, EA_4BYTE, tmpReg, varNum, offs + 8);
}
genEmitStoreLclTypeSimd12(treeNode, varNum, offs);
}

genUpdateLifeStore(treeNode, tgtReg, varDsc);
Expand All @@ -291,36 +260,82 @@ void CodeGen::genStoreLclTypeSimd12(GenTreeLclVarCommon* treeNode)
void CodeGen::genLoadLclTypeSimd12(GenTreeLclVarCommon* treeNode)
{
assert(treeNode->OperIs(GT_LCL_FLD, GT_LCL_VAR));
genEmitLoadLclTypeSimd12(treeNode->GetRegNum(), treeNode->GetLclNum(), treeNode->GetLclOffs());
genProduceReg(treeNode);
}

emitter* emit = GetEmitter();
//------------------------------------------------------------------------
// genEmitStoreLclTypeSimd12: Emit code to store a SIMD12 value to stack.
//
// Arguments:
// store - The store node
// lclNum - Stack local's number
// offset - Offset to store at
//
void CodeGen::genEmitStoreLclTypeSimd12(GenTree* store, unsigned lclNum, unsigned offset)
{
assert(store->OperIsLocalStore() || store->OperIs(GT_STOREIND));

unsigned offs = treeNode->GetLclOffs();
unsigned varNum = treeNode->GetLclNum();
assert(varNum < compiler->lvaCount);
emitter* emit = GetEmitter();
GenTree* data = store->Data();
regNumber dataReg = data->GetRegNum();

regNumber tgtReg = treeNode->GetRegNum();
// Store lower 8 bytes
emit->emitIns_S_R(INS_movsd_simd, EA_8BYTE, dataReg, lclNum, offset);

if (data->IsVectorZero())
{
// Store upper 4 bytes
emit->emitIns_S_R(INS_movss, EA_4BYTE, dataReg, lclNum, offset + 8);
}
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
// Extract and store upper 4 bytes
emit->emitIns_S_R_I(INS_extractps, EA_16BYTE, lclNum, offset + 8, dataReg, 2);
}
else
{
regNumber tmpReg = store->GetSingleTempReg();

// Extract upper 4 bytes from data
emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg);

// Store upper 4 bytes
emit->emitIns_S_R(INS_movss, EA_4BYTE, tmpReg, lclNum, offset + 8);
}
}

//------------------------------------------------------------------------
// genEmitLoadLclTypeSimd12: Emit code to load a SIMD12 value from stack.
//
// Arguments:
// tgtReg - Register to load into
// lclNum - Stack local's number
// offset - Offset to load from
//
void CodeGen::genEmitLoadLclTypeSimd12(regNumber tgtReg, unsigned lclNum, unsigned offset)
{
emitter* emit = GetEmitter();

if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
// Load lower 8 bytes into tgtReg, preserving upper 4 bytes
emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, varNum, offs);
emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, lclNum, offset);

// Load and insert upper 4 byte, 0x20 inserts to index 2 and 0x8 zeros index 3
emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, varNum, offs + 8, 0x28);
emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, lclNum, offset + 8, 0x28);
}
else
{
// Load upper 4 bytes to lower half of tgtReg
emit->emitIns_R_S(INS_movss, EA_4BYTE, tgtReg, varNum, offs + 8);
emit->emitIns_R_S(INS_movss, EA_4BYTE, tgtReg, lclNum, offset + 8);

// Move upper 4 bytes to upper half of tgtReg
emit->emitIns_R_R(INS_movlhps, EA_16BYTE, tgtReg, tgtReg);

// Load lower 8 bytes into tgtReg, preserving upper 4 bytes
emit->emitIns_R_S(INS_movlps, EA_16BYTE, tgtReg, varNum, offs);
emit->emitIns_R_S(INS_movlps, EA_16BYTE, tgtReg, lclNum, offset);
}

genProduceReg(treeNode);
}

#ifdef TARGET_X86
Expand Down