diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index e13147e9461eb..841b820a7ac42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -16,9 +16,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPUIGroupLP.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -899,31 +899,32 @@ bool MFMASmallGemmOpt::applyIGLPStrategy( class MFMAExpInterleaveOpt final : public IGLPStrategy { private: // The count of TRANS SUs involved in the interleaved pipeline - static unsigned TransPipeCount; + unsigned TransPipeCount = 0; // The count of MFMA SUs involved in the interleaved pipeline - static unsigned MFMAPipeCount; + unsigned MFMAPipeCount = 0; // The count of Add SUs involved in the interleaved pipeline - static unsigned AddPipeCount; + unsigned AddPipeCount = 0; // The number of transitive MFMA successors for each TRANS SU - static unsigned MFMAEnablement; + unsigned MFMAEnablement = 0; // The number of transitive TRANS predecessors for each MFMA SU - static unsigned ExpRequirement; + unsigned ExpRequirement = 0; // The count of independent "chains" of MFMA instructions in the pipeline - static unsigned MFMAChains; + unsigned MFMAChains = 0; // The length of each independent "chain" of MFMA instructions - static unsigned MFMAChainLength; + unsigned MFMAChainLength = 0; // Whether or not the pipeline has V_CVT instructions - static bool HasCvt; + bool HasCvt = false; // Whether or not there are instructions between the TRANS instruction and // V_CVT - static bool HasChainBetweenCvt; + bool HasChainBetweenCvt = false; // The first occuring DS_READ which feeds an MFMA chain - static std::optional FirstPipeDSR; + std::optional FirstPipeDSR = std::nullopt; // The MFMAPipe SUs with no MFMA predecessors SmallVector MFMAChainSeeds; // Compute the heuristics for the pipeline, returning whether or not the DAG // is well formatted for the mutation bool analyzeDAG(const SIInstrInfo *TII); + bool AnalysisResult; /// Whether or not the instruction is a transitive predecessor of an MFMA /// instruction @@ -1334,17 +1335,6 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy { } }; -unsigned MFMAExpInterleaveOpt::TransPipeCount = 0; -unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; -unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; -unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; -unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; -unsigned MFMAExpInterleaveOpt::MFMAChains = 0; -unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; -bool MFMAExpInterleaveOpt::HasCvt = false; -bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; -std::optional MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; - bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { SmallVector ExpPipeCands; SmallVector MFMAPipeCands; @@ -1367,6 +1357,12 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { auto Opc = SU.getInstr()->getOpcode(); if (TII->isTRANS(Opc)) { // Avoid counting a potential bonus V_EXP which all the MFMA depend on + // FIXME: This heuristic needs improvement/clarification! + // In general, the pipeline seems to look like this: + // fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16 + // (with potential arithmetic between exp and cvt) + // see + // https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378 if (SU.Succs.size() >= 7) continue; for (auto &Succ : SU.Succs) { @@ -1457,6 +1453,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { } } + MFMAChainSeeds.clear(); MFMAChains = 0; for (auto &MFMAPipeSU : MFMAPipeSUs) { if (is_contained(MFMAChainSeeds, MFMAPipeSU)) @@ -1474,8 +1471,9 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { for (auto Pred : MFMAChainSeeds[0]->Preds) { if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && - Pred.getSUnit()->getInstr()->mayLoad()) + Pred.getSUnit()->getInstr()->mayLoad()) { FirstPipeDSR = Pred.getSUnit()->NodeNum; + } } MFMAChainLength = MFMAPipeCount / MFMAChains; @@ -1527,12 +1525,8 @@ bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, const GCNSubtarget &ST = DAG->MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - if (Phase != AMDGPU::SchedulingPhase::PostRA) - MFMAChainSeeds.clear(); - if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) - return false; - - return true; + AnalysisResult = analyzeDAG(TII); + return AnalysisResult; } bool MFMAExpInterleaveOpt::applyIGLPStrategy( @@ -1540,6 +1534,8 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( DenseMap> &SyncedSchedGroups, AMDGPU::SchedulingPhase Phase) { + assert(AnalysisResult && "no or failed DAG analysis"); + bool IsSmallKernelType = MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; bool IsLargeKernelType = @@ -1559,18 +1555,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( unsigned CurrMFMAForTransPosition = 0; auto incrementTransPosition = [&MFMAChain, &PositionInChain, - &CurrMFMAForTransPosition]() { + &CurrMFMAForTransPosition, this]() { CurrMFMAForTransPosition += MFMAEnablement; PositionInChain = (CurrMFMAForTransPosition / MFMAChains); MFMAChain = CurrMFMAForTransPosition % MFMAChains; }; - auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { + auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() { auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; return (TempMFMAForTrans / MFMAChains); }; - auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { + auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() { auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; return TempMFMAForTrans % MFMAChains; }; @@ -1580,7 +1576,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( unsigned PositionInChainForMFMA = 0; auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, - &PositionInChainForMFMA]() { + &PositionInChainForMFMA, this]() { ++CurrMFMAPosition; MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; @@ -2071,22 +2067,16 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { } }; -static unsigned DSWCount = 0; -static unsigned DSWWithPermCount = 0; -static unsigned DSWWithSharedVMEMCount = 0; - bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, AMDGPU::SchedulingPhase Phase) { unsigned MFMACount = 0; unsigned DSRCount = 0; + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; - bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; - - assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && - DSWWithSharedVMEMCount == 0)) && - "DSWCounters should be zero in pre-RA scheduling!"); SmallVector DSWithPerms; for (auto &SU : DAG->SUnits) { auto *I = SU.getInstr(); @@ -2095,7 +2085,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( else if (TII->isDS(*I)) { if (I->mayLoad()) ++DSRCount; - else if (I->mayStore() && IsInitial) { + else if (I->mayStore()) { ++DSWCount; for (auto Pred : SU.Preds) { if (Pred.getSUnit()->getInstr()->getOpcode() == @@ -2108,58 +2098,54 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( } } - if (IsInitial) { - DSWWithPermCount = DSWithPerms.size(); - auto *I = DSWithPerms.begin(); - auto *E = DSWithPerms.end(); - - // Get the count of DS_WRITES with V_PERM predecessors which - // have loop carried dependencies (WAR) on the same VMEM_READs. - // We consider partial overlap as a miss -- in other words, - // for a given DS_W, we only consider another DS_W as matching - // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred - // for every V_PERM pred of this DS_W. - DenseMap VMEMLookup; - SmallVector Counted; - for (; I != E; I++) { - SUnit *Cand = nullptr; - bool MissedAny = false; - for (auto &Pred : (*I)->Preds) { - if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) - continue; + DSWWithPermCount = DSWithPerms.size(); + + // Get the count of DS_WRITES with V_PERM predecessors which + // have loop carried dependencies (WAR) on the same VMEM_READs. + // We consider partial overlap as a miss -- in other words, + // for a given DS_W, we only consider another DS_W as matching + // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred + // for every V_PERM pred of this DS_W. + DenseMap VMEMLookup; + SmallVector Counted; + for (SUnit *DSWrite : DSWithPerms) { + SUnit *Cand = nullptr; + bool MissedAny = false; + for (auto &Pred : DSWrite->Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) + continue; - if (Cand && llvm::is_contained(Counted, Cand)) - break; + if (Cand && llvm::is_contained(Counted, Cand)) + break; - for (auto &Succ : Pred.getSUnit()->Succs) { - auto *MI = Succ.getSUnit()->getInstr(); - if (!TII->isVMEM(*MI) || !MI->mayLoad()) - continue; + for (auto &Succ : Pred.getSUnit()->Succs) { + auto *MI = Succ.getSUnit()->getInstr(); + if (!TII->isVMEM(*MI) || !MI->mayLoad()) + continue; - if (MissedAny || !VMEMLookup.size()) { - MissedAny = true; - VMEMLookup[MI] = *I; - continue; - } + if (MissedAny || !VMEMLookup.size()) { + MissedAny = true; + VMEMLookup[MI] = DSWrite; + continue; + } - auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I); - if (Inserted) { - MissedAny = true; - continue; - } + auto [It, Inserted] = VMEMLookup.try_emplace(MI, DSWrite); + if (Inserted) { + MissedAny = true; + continue; + } - Cand = It->second; - if (llvm::is_contained(Counted, Cand)) { - MissedAny = true; - break; - } + Cand = It->second; + if (llvm::is_contained(Counted, Cand)) { + MissedAny = true; + break; } } - if (!MissedAny && Cand) { - DSWWithSharedVMEMCount += 2; - Counted.push_back(Cand); - Counted.push_back(*I); - } + } + if (!MissedAny && Cand) { + DSWWithSharedVMEMCount += 2; + Counted.push_back(Cand); + Counted.push_back(DSWrite); } } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll index 9dad8ff365651..4005aacc90ef3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll @@ -5,21 +5,16 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: ds_read_b128 v[2:5], v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112 ; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96 ; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80 ; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64 +; GCN-NEXT: ds_read_b128 v[2:5], v0 ; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16 ; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32 ; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b128 v0, v[2:5] -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112 ; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96 ; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80 @@ -27,6 +22,11 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) { ; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48 ; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32 ; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16 +; GCN-NEXT: ds_write_b128 v0, v[2:5] +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: ds_write_b64 v0, v[2:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index 94de6dd31cad5..729596f541147 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -79,7 +79,6 @@ ; GCN-NEXT: ; implicit-def: $vgpr211 ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 ; GCN-NEXT: ; implicit-def: $vgpr198 - ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 ; GCN-NEXT: ; implicit-def: $vgpr34 @@ -87,6 +86,7 @@ ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GCN-NEXT: ; implicit-def: $vgpr21 ; GCN-NEXT: ; implicit-def: $vgpr22 ; GCN-NEXT: ; implicit-def: $vgpr23 @@ -510,69 +510,70 @@ ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 - ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 - ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 - ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 - ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 - ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: v_exp_f32_e32 v113, v112 ; GCN-NEXT: v_exp_f32_e32 v114, v138 ; GCN-NEXT: v_exp_f32_e32 v115, v139 ; GCN-NEXT: v_exp_f32_e32 v116, v140 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 + ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 ; GCN-NEXT: v_exp_f32_e32 v117, v141 ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 ; GCN-NEXT: v_exp_f32_e32 v118, v142 + ; GCN-NEXT: v_exp_f32_e32 v119, v143 ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 ; GCN-NEXT: v_exp_f32_e32 v120, v144 - ; GCN-NEXT: v_exp_f32_e32 v113, v112 - ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 - ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 - ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 + ; GCN-NEXT: v_exp_f32_e32 v112, v129 ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 - ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 - ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 - ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 - ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 - ; GCN-NEXT: v_exp_f32_e32 v112, v129 - ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 - ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v119, v143 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e64 v0, v0, v112 + ; GCN-NEXT: v_mul_f32_e64 v1, v1, v112 + ; GCN-NEXT: v_mul_f32_e64 v2, v2, v112 + ; GCN-NEXT: v_mul_f32_e64 v3, v3, v112 + ; GCN-NEXT: v_mul_f32_e64 v4, v4, v112 + ; GCN-NEXT: v_mul_f32_e64 v5, v5, v112 + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 @@ -595,46 +596,50 @@ ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 ; GCN-NEXT: v_exp_f32_e32 v121, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 ; GCN-NEXT: v_exp_f32_e32 v122, v149 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 + ; GCN-NEXT: v_exp_f32_e32 v123, v150 ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_exp_f32_e32 v124, v151 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 - ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 - ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v123, v150 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 + ; GCN-NEXT: v_exp_f32_e32 v96, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 - ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 - ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 - ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 + ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 + ; GCN-NEXT: v_exp_f32_e32 v97, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 + ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v124, v151 ; GCN-NEXT: ds_read_b128 v[130:133], v197 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 - ; GCN-NEXT: v_exp_f32_e32 v96, v129 - ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v97, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_exp_f32_e32 v98, v138 + ; GCN-NEXT: v_exp_f32_e32 v101, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 - ; GCN-NEXT: v_exp_f32_e32 v98, v138 ; GCN-NEXT: v_exp_f32_e32 v99, v127 ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 @@ -665,28 +670,26 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[194:195] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 + ; GCN-NEXT: v_exp_f32_e32 v102, v142 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v101, v125 ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] - ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v102, v142 ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND @@ -694,47 +697,47 @@ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 ; GCN-NEXT: v_exp_f32_e32 v103, v150 - ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 - ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 + ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 + ; GCN-NEXT: v_exp_f32_e32 v105, v125 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128 ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 ; GCN-NEXT: v_exp_f32_e32 v104, v129 ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] ; GCN-NEXT: ds_read_b128 v[130:133], v198 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 - ; GCN-NEXT: v_exp_f32_e32 v105, v125 ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 - ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 - ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 - ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] ; GCN-NEXT: v_exp_f32_e32 v106, v156 ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 + ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 + ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 + ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 + ; GCN-NEXT: v_exp_f32_e32 v107, v138 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v107, v138 ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -744,6 +747,10 @@ ; GCN-NEXT: v_exp_f32_e32 v108, v129 ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] + ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 + ; GCN-NEXT: v_exp_f32_e32 v111, v146 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 ; GCN-NEXT: v_exp_f32_e32 v109, v125 @@ -751,32 +758,16 @@ ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 ; GCN-NEXT: v_exp_f32_e32 v110, v156 - ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 - ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v111, v146 ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 + ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] ; GCN-NEXT: v_exp_f32_e32 v80, v129 - ; GCN-NEXT: ds_read_b128 v[130:133], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] - ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 - ; GCN-NEXT: v_exp_f32_e32 v81, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 - ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 @@ -784,13 +775,25 @@ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128 - ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] ; GCN-NEXT: v_exp_f32_e32 v83, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 + ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 + ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -803,7 +806,6 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v200, v[150:151] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v201, v[152:153] @@ -811,19 +813,17 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[154:155] ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 - ; GCN-NEXT: v_exp_f32_e32 v84, v129 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] ; GCN-NEXT: v_exp_f32_e32 v85, v125 ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 + ; GCN-NEXT: v_exp_f32_e32 v84, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 @@ -833,27 +833,28 @@ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 + ; GCN-NEXT: v_exp_f32_e32 v86, v156 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 - ; GCN-NEXT: v_exp_f32_e32 v86, v156 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 - ; GCN-NEXT: v_exp_f32_e32 v87, v157 - ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 - ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 - ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 + ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 + ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 + ; GCN-NEXT: v_exp_f32_e32 v87, v157 ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 ; GCN-NEXT: ds_read_b128 v[130:133], v198 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -870,13 +871,15 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v89, v125 ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 + ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 + ; GCN-NEXT: v_exp_f32_e32 v89, v125 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 - ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 ; GCN-NEXT: v_exp_f32_e32 v90, v158 ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] @@ -898,46 +901,45 @@ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 - ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] ; GCN-NEXT: v_exp_f32_e32 v93, v125 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] ; GCN-NEXT: v_exp_f32_e32 v94, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] + ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 ; GCN-NEXT: v_exp_f32_e32 v95, v127 ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v126, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 ; GCN-NEXT: v_exp_f32_e32 v125, v129 - ; GCN-NEXT: ds_read_b128 v[132:135], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v127, v143 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 - ; GCN-NEXT: v_exp_f32_e32 v126, v142 - ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v127, v143 + ; GCN-NEXT: v_exp_f32_e32 v129, v138 ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] + ; GCN-NEXT: ds_read_b128 v[132:135], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v129, v138 ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -945,127 +947,117 @@ ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v130, v158 + ; GCN-NEXT: v_exp_f32_e32 v131, v144 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b64 v199, v[150:151] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v200, v[152:153] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] - ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 - ; GCN-NEXT: v_exp_f32_e32 v130, v158 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v201, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[156:157] + ; GCN-NEXT: v_exp_f32_e32 v133, v141 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 - ; GCN-NEXT: v_exp_f32_e32 v131, v144 - ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] ; GCN-NEXT: v_exp_f32_e32 v132, v145 - ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 + ; GCN-NEXT: v_exp_f32_e32 v73, v144 ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v133, v141 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 + ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 + ; GCN-NEXT: v_exp_f32_e32 v72, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_b128 v[68:71], v198 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v72, v146 - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v73, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 - ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 - ; GCN-NEXT: v_exp_f32_e32 v74, v65 - ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 ; GCN-NEXT: v_exp_f32_e32 v75, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_exp_f32_e32 v74, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 + ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 ; GCN-NEXT: v_exp_f32_e32 v76, v146 ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73 ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] ; GCN-NEXT: v_exp_f32_e32 v77, v147 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v142, v146 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 - ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 - ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] + ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 ; GCN-NEXT: v_exp_f32_e32 v78, v67 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 - ; GCN-NEXT: v_exp_f32_e32 v79, v148 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v142, v146 - ; GCN-NEXT: ds_read_b128 v[68:71], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v137, v147 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] ; GCN-NEXT: v_exp_f32_e32 v138, v138 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 + ; GCN-NEXT: v_exp_f32_e32 v79, v148 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] - ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v137, v147 + ; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_exp_f32_e32 v52, v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 - ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 - ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 @@ -1097,14 +1089,21 @@ ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 + ; GCN-NEXT: ds_read_b128 v[68:71], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 ; GCN-NEXT: v_add_f32_e32 v53, v89, v53 + ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 ; GCN-NEXT: v_add_f32_e32 v53, v90, v53 ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 @@ -1118,8 +1117,11 @@ ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] - ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: s_nop 6 ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0a8d7acd187fc..713e8f4c791dd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -221,51 +221,47 @@ ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 ; GCN-NEXT: v_exp_f32_e32 v73, v16 + ; GCN-NEXT: v_exp_f32_e32 v17, v17 ; GCN-NEXT: v_exp_f32_e32 v74, v18 ; GCN-NEXT: v_exp_f32_e32 v75, v19 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 ; GCN-NEXT: v_exp_f32_e32 v81, v21 ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 ; GCN-NEXT: v_exp_f32_e32 v82, v22 ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 + ; GCN-NEXT: v_exp_f32_e32 v80, v20 + ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 + ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 + ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 - ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 ; GCN-NEXT: ds_read_b128 v[18:21], v84 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v16, v24 + ; GCN-NEXT: v_exp_f32_e32 v85, v22 + ; GCN-NEXT: v_exp_f32_e32 v88, v23 ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -286,22 +282,24 @@ ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 + ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 + ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 + ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 + ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 ; GCN-NEXT: ds_read_b128 v[26:29], v91 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -326,50 +324,46 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v79, v65 ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 + ; GCN-NEXT: v_exp_f32_e32 v22, v64 + ; GCN-NEXT: v_exp_f32_e32 v64, v65 + ; GCN-NEXT: v_exp_f32_e32 v23, v23 + ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 + ; GCN-NEXT: v_exp_f32_e32 v25, v67 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 + ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_exp_f32_e32 v27, v3 + ; GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: v_exp_f32_e32 v68, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_b128 v[0:3], v84 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -380,42 +374,59 @@ ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 + ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 + ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 + ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 + ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 + ; GCN-NEXT: v_exp_f32_e32 v26, v26 + ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_exp_f32_e32 v6, v6 ; GCN-NEXT: v_exp_f32_e32 v7, v7 ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_exp_f32_e32 v8, v8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 ; GCN-NEXT: v_exp_f32_e32 v10, v0 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 + ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 + ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 + ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 + ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 + ; GCN-NEXT: v_exp_f32_e32 v12, v3 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 @@ -423,40 +434,26 @@ ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 ; GCN-NEXT: v_exp_f32_e32 v17, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 + ; GCN-NEXT: v_exp_f32_e32 v9, v9 + ; GCN-NEXT: v_exp_f32_e32 v11, v11 ; GCN-NEXT: v_exp_f32_e32 v15, v3 ; GCN-NEXT: v_exp_f32_e32 v18, v1 ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 ; GCN-NEXT: ds_read_b128 v[0:3], v91 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 @@ -482,13 +479,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] + ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index a10c99070d8e1..64265f6f9a505 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -285,6 +285,34 @@ entry: ret void } +; If we run this function after test_iglp_opt_rev_mfma_gemm, we get: +; > Assertion `(!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && +; > DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA +; > scheduling!"' failed. +; This is because, previously, the counters were global static variables which +; weren't reset. +define amdgpu_kernel void @test_after_test_iglp_opt_rev_mfma_gemm(ptr %src, ptr addrspace(3) %dst) { +; GCN-LABEL: test_after_test_iglp_opt_rev_mfma_gemm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: ; iglp_opt mask(0x00000001) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: ds_write_b8 v1, v0 +; GCN-NEXT: s_endpgm +entry: + %a = load i1, ptr %src, align 1 + call void @llvm.amdgcn.iglp.opt(i32 1) + store i1 %a, ptr addrspace(3) %dst, align 1 + ret void +} + define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_asm_sideeffect: ; GCN: ; %bb.0: ; %entry