@@ -3486,6 +3486,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
34863486 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
34873487 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
34883488 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3489+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
34893490 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
34903491 // Don't fold if we are using source or output modifiers. The new VOP2
34913492 // instructions don't have them.
@@ -3506,6 +3507,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35063507 bool IsFMA =
35073508 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35083509 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3510+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35093511 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35103512 MachineOperand *Src1 = getNamedOperand (UseMI, AMDGPU::OpName::src1);
35113513 MachineOperand *Src2 = getNamedOperand (UseMI, AMDGPU::OpName::src2);
@@ -3539,16 +3541,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35393541
35403542 unsigned NewOpc =
35413543 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3542- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
3544+ : ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3545+ ? AMDGPU::V_FMAMK_F16_t16
3546+ : AMDGPU::V_FMAMK_F16_fake16
35433547 : AMDGPU::V_FMAMK_F16)
35443548 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
35453549 if (pseudoToMCOpcode (NewOpc) == -1 )
35463550 return false ;
35473551
3548- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3549- // would also require restricting their register classes. For now
3550- // just bail out.
3551- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3552+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3553+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3554+ // restricting their register classes. For now just bail out.
3555+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3556+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
35523557 return false ;
35533558
35543559 const int64_t Imm = getImmFor (RegSrc == Src1 ? *Src0 : *Src1);
@@ -3563,7 +3568,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35633568 Src0->setIsKill (RegSrc->isKill ());
35643569
35653570 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3566- Opc == AMDGPU::V_FMAC_F32_e64 ||
3571+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35673572 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
35683573 UseMI.untieRegOperand (
35693574 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3618,23 +3623,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36183623
36193624 unsigned NewOpc =
36203625 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3621- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
3626+ : ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3627+ ? AMDGPU::V_FMAAK_F16_t16
3628+ : AMDGPU::V_FMAAK_F16_fake16
36223629 : AMDGPU::V_FMAAK_F16)
36233630 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
36243631 if (pseudoToMCOpcode (NewOpc) == -1 )
36253632 return false ;
36263633
3627- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3628- // would also require restricting their register classes. For now
3629- // just bail out.
3630- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3634+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3635+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3636+ // restricting their register classes. For now just bail out.
3637+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3638+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36313639 return false ;
36323640
36333641 // FIXME: This would be a lot easier if we could return a new instruction
36343642 // instead of having to modify in place.
36353643
36363644 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3637- Opc == AMDGPU::V_FMAC_F32_e64 ||
3645+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36383646 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36393647 UseMI.untieRegOperand (
36403648 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3821,8 +3829,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38213829 return AMDGPU::V_FMA_LEGACY_F32_e64;
38223830 case AMDGPU::V_FMAC_F16_e32:
38233831 case AMDGPU::V_FMAC_F16_e64:
3832+ case AMDGPU::V_FMAC_F16_t16_e64:
38243833 case AMDGPU::V_FMAC_F16_fake16_e64:
3825- return ST.hasTrue16BitInsts () ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3834+ return ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3835+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
3836+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
38263837 : AMDGPU::V_FMA_F16_gfx9_e64;
38273838 case AMDGPU::V_FMAC_F32_e32:
38283839 case AMDGPU::V_FMAC_F32_e64:
@@ -3888,19 +3899,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
38883899 return MIB;
38893900 }
38903901
3891- assert (
3892- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3893- " V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3894- " pre-RA" );
3902+ assert (Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3903+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3904+ " V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
3905+ " present "
3906+ " pre-RA" );
38953907
38963908 // Handle MAC/FMAC.
38973909 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
38983910 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3911+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
38993912 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39003913 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39013914 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39023915 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39033916 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3917+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39043918 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39053919 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39063920 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3915,6 +3929,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39153929 return nullptr ;
39163930 case AMDGPU::V_MAC_F16_e64:
39173931 case AMDGPU::V_FMAC_F16_e64:
3932+ case AMDGPU::V_FMAC_F16_t16_e64:
39183933 case AMDGPU::V_FMAC_F16_fake16_e64:
39193934 case AMDGPU::V_MAC_F32_e64:
39203935 case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4000,8 +4015,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40004015 int64_t Imm;
40014016 if (!Src0Literal && getFoldableImm (Src2, Imm, &DefMI)) {
40024017 unsigned NewOpc =
4003- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
4004- : AMDGPU::V_FMAAK_F16)
4018+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts ()
4019+ ? ST.useRealTrue16Insts ()
4020+ ? AMDGPU::V_FMAAK_F16_t16
4021+ : AMDGPU::V_FMAAK_F16_fake16
4022+ : AMDGPU::V_FMAAK_F16)
40054023 : AMDGPU::V_FMAAK_F32)
40064024 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
40074025 if (pseudoToMCOpcode (NewOpc) != -1 ) {
@@ -4018,11 +4036,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40184036 return MIB;
40194037 }
40204038 }
4021- unsigned NewOpc =
4022- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
4023- : AMDGPU::V_FMAMK_F16)
4024- : AMDGPU::V_FMAMK_F32)
4025- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4039+ unsigned NewOpc = IsFMA
4040+ ? (IsF16 ? (ST.hasTrue16BitInsts ()
4041+ ? ST.useRealTrue16Insts ()
4042+ ? AMDGPU::V_FMAMK_F16_t16
4043+ : AMDGPU::V_FMAMK_F16_fake16
4044+ : AMDGPU::V_FMAMK_F16)
4045+ : AMDGPU::V_FMAMK_F32)
4046+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
40264047 if (!Src0Literal && getFoldableImm (Src1, Imm, &DefMI)) {
40274048 if (pseudoToMCOpcode (NewOpc) != -1 ) {
40284049 MIB = BuildMI (MBB, MI, MI.getDebugLoc (), get (NewOpc))
@@ -4468,6 +4489,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
44684489 case AMDGPU::V_MAC_F32_e64:
44694490 case AMDGPU::V_MAC_LEGACY_F32_e64:
44704491 case AMDGPU::V_FMAC_F16_e64:
4492+ case AMDGPU::V_FMAC_F16_t16_e64:
44714493 case AMDGPU::V_FMAC_F16_fake16_e64:
44724494 case AMDGPU::V_FMAC_F32_e64:
44734495 case AMDGPU::V_FMAC_F64_e64:
@@ -5520,7 +5542,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55205542 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55215543 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55225544 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5523- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5545+ case AMDGPU::S_FMAC_F16:
5546+ return ST.useRealTrue16Insts () ? AMDGPU::V_FMAC_F16_t16_e64
5547+ : AMDGPU::V_FMAC_F16_fake16_e64;
55245548 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55255549 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55265550 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
0 commit comments