@@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35443544 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
35453545 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35463546 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3547+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35473548 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
35483549 // Don't fold if we are using source or output modifiers. The new VOP2
35493550 // instructions don't have them.
@@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35643565 bool IsFMA =
35653566 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35663567 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3568+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35673569 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35683570 MachineOperand *Src1 = getNamedOperand (UseMI, AMDGPU::OpName::src1);
35693571 MachineOperand *Src2 = getNamedOperand (UseMI, AMDGPU::OpName::src2);
@@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35973599
35983600 unsigned NewOpc =
35993601 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3600- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
3602+ : ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3603+ ? AMDGPU::V_FMAMK_F16_t16
3604+ : AMDGPU::V_FMAMK_F16_fake16
36013605 : AMDGPU::V_FMAMK_F16)
36023606 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
36033607 if (pseudoToMCOpcode (NewOpc) == -1 )
36043608 return false ;
36053609
3606- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3607- // would also require restricting their register classes. For now
3608- // just bail out.
3609- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3610+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3611+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3612+ // restricting their register classes. For now just bail out.
3613+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3614+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
36103615 return false ;
36113616
36123617 const int64_t Imm = getImmFor (RegSrc == Src1 ? *Src0 : *Src1);
@@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36213626 Src0->setIsKill (RegSrc->isKill ());
36223627
36233628 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3624- Opc == AMDGPU::V_FMAC_F32_e64 ||
3629+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36253630 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36263631 UseMI.untieRegOperand (
36273632 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36763681
36773682 unsigned NewOpc =
36783683 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3679- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
3684+ : ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3685+ ? AMDGPU::V_FMAAK_F16_t16
3686+ : AMDGPU::V_FMAAK_F16_fake16
36803687 : AMDGPU::V_FMAAK_F16)
36813688 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
36823689 if (pseudoToMCOpcode (NewOpc) == -1 )
36833690 return false ;
36843691
3685- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3686- // would also require restricting their register classes. For now
3687- // just bail out.
3688- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3692+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3693+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3694+ // restricting their register classes. For now just bail out.
3695+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3696+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36893697 return false ;
36903698
36913699 // FIXME: This would be a lot easier if we could return a new instruction
36923700 // instead of having to modify in place.
36933701
36943702 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3695- Opc == AMDGPU::V_FMAC_F32_e64 ||
3703+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36963704 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36973705 UseMI.untieRegOperand (
36983706 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38793887 return AMDGPU::V_FMA_LEGACY_F32_e64;
38803888 case AMDGPU::V_FMAC_F16_e32:
38813889 case AMDGPU::V_FMAC_F16_e64:
3890+ case AMDGPU::V_FMAC_F16_t16_e64:
38823891 case AMDGPU::V_FMAC_F16_fake16_e64:
3883- return ST.hasTrue16BitInsts () ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3892+ return ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3893+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
3894+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
38843895 : AMDGPU::V_FMA_F16_gfx9_e64;
38853896 case AMDGPU::V_FMAC_F32_e32:
38863897 case AMDGPU::V_FMAC_F32_e64:
@@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39463957 return MIB;
39473958 }
39483959
3949- assert (
3950- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3951- " V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3952- " pre-RA" );
3960+ assert (Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3961+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3962+ " V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
3963+ " present "
3964+ " pre-RA" );
39533965
39543966 // Handle MAC/FMAC.
39553967 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
39563968 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3969+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39573970 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39583971 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39593972 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39603973 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39613974 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3975+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39623976 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39633977 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39643978 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39733987 return nullptr ;
39743988 case AMDGPU::V_MAC_F16_e64:
39753989 case AMDGPU::V_FMAC_F16_e64:
3990+ case AMDGPU::V_FMAC_F16_t16_e64:
39763991 case AMDGPU::V_FMAC_F16_fake16_e64:
39773992 case AMDGPU::V_MAC_F32_e64:
39783993 case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40584073 int64_t Imm;
40594074 if (!Src0Literal && getFoldableImm (Src2, Imm, &DefMI)) {
40604075 unsigned NewOpc =
4061- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
4062- : AMDGPU::V_FMAAK_F16)
4076+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts ()
4077+ ? ST.useRealTrue16Insts ()
4078+ ? AMDGPU::V_FMAAK_F16_t16
4079+ : AMDGPU::V_FMAAK_F16_fake16
4080+ : AMDGPU::V_FMAAK_F16)
40634081 : AMDGPU::V_FMAAK_F32)
40644082 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
40654083 if (pseudoToMCOpcode (NewOpc) != -1 ) {
@@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40764094 return MIB;
40774095 }
40784096 }
4079- unsigned NewOpc =
4080- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
4081- : AMDGPU::V_FMAMK_F16)
4082- : AMDGPU::V_FMAMK_F32)
4083- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4097+ unsigned NewOpc = IsFMA
4098+ ? (IsF16 ? (ST.hasTrue16BitInsts ()
4099+ ? ST.useRealTrue16Insts ()
4100+ ? AMDGPU::V_FMAMK_F16_t16
4101+ : AMDGPU::V_FMAMK_F16_fake16
4102+ : AMDGPU::V_FMAMK_F16)
4103+ : AMDGPU::V_FMAMK_F32)
4104+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
40844105 if (!Src0Literal && getFoldableImm (Src1, Imm, &DefMI)) {
40854106 if (pseudoToMCOpcode (NewOpc) != -1 ) {
40864107 MIB = BuildMI (MBB, MI, MI.getDebugLoc (), get (NewOpc))
@@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
45264547 case AMDGPU::V_MAC_F32_e64:
45274548 case AMDGPU::V_MAC_LEGACY_F32_e64:
45284549 case AMDGPU::V_FMAC_F16_e64:
4550+ case AMDGPU::V_FMAC_F16_t16_e64:
45294551 case AMDGPU::V_FMAC_F16_fake16_e64:
45304552 case AMDGPU::V_FMAC_F32_e64:
45314553 case AMDGPU::V_FMAC_F64_e64:
@@ -5582,7 +5604,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55825604 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55835605 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55845606 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5585- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5607+ case AMDGPU::S_FMAC_F16:
5608+ return ST.useRealTrue16Insts () ? AMDGPU::V_FMAC_F16_t16_e64
5609+ : AMDGPU::V_FMAC_F16_fake16_e64;
55865610 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55875611 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55885612 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
0 commit comments