diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml new file mode 100644 index 00000000000..c52fe8123b1 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml @@ -0,0 +1,1001 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 
1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 64 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: 'Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + 
LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 + LdsInitCVgprs: false + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: true + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + 
PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: 
true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 64 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: 'Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950' + 
DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 + LdsInitCVgprs: false + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 
8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: true + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + 
IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + 
WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 64 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: 'Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, 
SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 + LdsInitCVgprs: false + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: true + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + 
NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + 
ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [4096, 4096, 1, 8192] + - [2, 0] + - - [4096, 4096, 1, 16384] + - [2, 0] + - - [8192, 8192, 1, 8192] + - [0, 0] + - - [8192, 8192, 1, 16384] + - [1, 0] + - - [32000, 8192, 1, 8192] + - [1, 0] +- null +- null +- DeviceEfficiency +- Equality \ No newline at end of file diff --git 
a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_HHS_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_HHS_BH_UserArgs.yaml index 88c397e4b9a..370cdf5edaa 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_HHS_BH_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_HHS_BH_UserArgs.yaml @@ -1,7 +1,7 @@ - {MinimumRequiredVersion: 4.33.0} - gfx950 - gfx950 -- [Device 0049, Device 0050] +- [Device 75a0] - Activation: false ActivationComputeDataType: 0 ActivationNoGuard: false @@ -10,7 +10,7 @@ AssignedDerivedParameters: true Batched: true BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] + BiasDataTypeList: [] BiasSrc: D ComplexConjugateA: false ComplexConjugateB: false @@ -58,6 +58,8 @@ StochasticRounding: false StridedBatched: true SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false TLUA: false TLUB: false Tensor0: 0 @@ -76,7 +78,7 @@ UseScaleAB: '' UseScaleAlphaVec: 0 UseScaleCD: false -- - 1LDSBuffer: 1 +- - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -84,28 +86,32 @@ AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertSummationElementMultiple: 64 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: '4' ConvertAfterDS: false - CustomKernelName: '' + CustomKernelName: 'Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950' DebugStreamK: 0 
DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 @@ -114,7 +120,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -126,7 +132,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_UserArgs_MT256x256x64_MI32x32x1_SN_K1_MIWT4_4 + KernelNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -135,37 +142,38 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - 
LocalWriteUseSgprB: false - LoopIters: 4 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -173,39 +181,42 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 256 MacroTile1: 256 MacroTileA: 256 MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 MaxOccupancy: 40 - MaxVgprNumber: 256 - MinVgprNumber: 0 - NoLdsWriteCode: false + MbskPrefetchOpt: 0 + NoLdsWriteCode: true NoReject: false - NoTailLoop: false + NoTailLoop: true + NonDTLTailLoopA: false + NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -213,6 +224,7 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -220,7 +232,7 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 0 + PreloadKernArgs: true ProblemType: Activation: false ActivationComputeDataType: 0 @@ -230,7 +242,7 @@ AssignedDerivedParameters: true Batched: true BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] + BiasDataTypeList: [] BiasSrc: D ComplexConjugateA: false ComplexConjugateB: false @@ -278,6 +290,8 @@ 
StochasticRounding: false StridedBatched: true SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false TLUA: false TLUB: false Tensor0: 0 @@ -300,29 +314,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_UserArgs_MT256x256x64_MI32x32x1_SN_GSU1_GSUC0_GSUWGMRR0_K1_MIWT4_4_SU0_SUM0_SUS0_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TotalVgprNumber: 512 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -331,18 +345,21 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 8 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -352,14 +369,328 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + 
reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 64 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: 'Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 
1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 + LdsInitCVgprs: false + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: true + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + 
ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 
0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] -- - - [128, 128, 1, 128, 128, 128, 128, 128] - - [0, 0.000604037] +- - - [4096, 4096, 1, 8192] + - [1, 0] + - - [4096, 4096, 1, 16384] + - [1, 0] + - - [8192, 8192, 1, 8192] + - [0, 0] + - - [8192, 8192, 1, 16384] + - [0, 0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.s new file mode 100644 index 00000000000..547037f592d --- /dev/null +++ 
b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.s @@ -0,0 +1,19059 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx950" +.text +.protected Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 +.globl Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 +.p2align 8 +.type Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_accum_offset 248 // accvgpr offset + .amdhsa_next_free_vgpr 504 // vgprs + .amdhsa_next_free_sgpr 88 // sgprs + .amdhsa_group_segment_fixed_size 133120 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_user_sgpr_count 13 + .amdhsa_user_sgpr_kernarg_preload_length 11 + .amdhsa_user_sgpr_kernarg_preload_offset 0 +.end_amdhsa_kernel +.text +/* Num VGPR =248 */ +/* Num AccVGPR=256 */ +/* Num SGPR =88 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 32 x 8 */ +/* SubGroup= 8 x 32 */ +/* VectorWidthA=8 */ +/* VectorWidthB=8 */ +/* GlobalReadVectorWidthA=8, GlobalReadVectorWidthB=8 */ +/* DirectToLdsA=True */ +/* DirectToLdsB=True */ +/* UseSgprForGRO=1 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 + ProblemType: + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + 
TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: False +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + .symbol: 'Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: 
strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + .group_segment_fixed_size: 133120 + .kernarg_segment_align: 8 + .kernarg_segment_size: 104 + .max_flat_workgroup_size: 256 + .private_segment_fixed_size: 0 + .sgpr_count: 88 + .sgpr_spill_count: 0 + .vgpr_count: 248 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950: +label_ASM_Start: /// Main body of the asm kernel +.macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req + v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber + v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA + v_add_u32 v[\vgprDstIdx+0], v[\vgprDstIdx+0], v[\vgprDstIdx+1] + v_lshrrev_b32 v[\vgprDstIdx+0], \magicShift, v[\vgprDstIdx+0] +.endm + +/******************************************/ +/* VGPR Assignments */ +/******************************************/ +/* ValuC range: [0-0), serializedStore enabled */ +.set vgprValuC, 0 +/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ +.set vgprBase, 4 +.set vgprGlobalReadOffsetA, 0 +.set vgprGlobalReadOffsetB, 1 +.set vgprLocalReadAddrA, 2 +.set vgprLocalReadAddrB, 3 +.set vgprLocalReadSwapAddrA, 132 +.set vgprLocalReadSwapAddrB, 133 +.set vgprSerial, 134 + +/******************************************/ +/* VGPR Macro Assignments */ +/******************************************/ +.set vgprValuA_X0_I0_BASE, vgprBase+0 +.set vgprValuB_X0_I0_BASE, vgprBase+64 +.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0 +.set vgprValuA_X1_I0, vgprValuA_X0_I0_BASE+32 +.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0 +.set 
vgprValuB_X1_I0, vgprValuB_X0_I0_BASE+32 + +/******************************************/ +/* SGPR Assignments */ +/******************************************/ +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprWGM, 11 +.set sgprLoopCounterL, 12 +.set sgprOrigLoopCounter, 13 +.set sgprSrdD, 16 +.set sgprSrdC, 20 +.set sgprNumWorkGroups0, 14 +.set sgprNumWorkGroups1, 15 +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprLocalWriteAddrA, 46 +.set sgprLocalWriteAddrB, 47 +.set sgprSwapA, 48 +.set sgprSwapB, 49 +.set sgprGSU, 50 + +/* Size Assignments */ +.set sgprSizeI, sgprSizesFree+0 +.set sgprSizeJ, sgprSizesFree+1 +.set sgprSizeK, sgprSizesFree+2 +.set sgprSizeL, sgprSizesSum+0 + +/* Stride Assignments */ +.set constStrideD0I, 1 +.set sgprStrideD1J, sgprStridesD+0 +.set sgprStrideDK, sgprStridesD+1 +.set constStrideC0I, 1 +.set sgprStrideC1J, sgprStridesC+0 +.set sgprStrideCK, sgprStridesC+1 +.set constStrideAL, 1 +.set sgprStrideA0I, sgprStridesA+0 +.set sgprStrideAK, sgprStridesA+1 +.set constStrideBL, 1 +.set sgprStrideB1J, sgprStridesB+0 +.set sgprStrideBK, sgprStridesB+1 + +.set MT0, 256 +.set MT1, 256 +.set DepthU, 64 +.set BpeA, 2 +.set BpeALog2, 1 +.set BpeB, 2 +.set BpeBLog2, 1 +.set BpeAGR, 2 +.set BpeAGRLog2, 1 +.set BpeBGR, 2 +.set BpeBGRLog2, 1 +/* Number of elements to shift-left SRD */ +.set SrdShiftLeftA, 8 +.set SrdShiftLeftB, 8 +/* 2GB limit - set offsets to -1 to exceed this and clamp */ +.set BufferLimit, 0xffffffff +.set BufferOOB, 0x80000000 + +/******************************************/ +/* Bits 127:96 of SRD. 
*/ +/* hex: 0x20000 */ +/* dst_sel_x (3b): 0 */ +/* dst_sel_y (3b): 0 */ +/* dst_sel_z (3b): 0 */ +/* dst_sel_w (3b): 0 */ +/* num_format (3b): 0 */ +/* data_format (4b): 4 */ +/* user_vm_enable (1b): 0 */ +/* user_vm_mode (1b): 0 */ +/* index_stride (2b): 0 */ +/* add_tid_enable (1b): 0 */ +/* _unusedA (3b): 0 */ +/* nv (1b): 0 */ +/* _unusedB (2b): 0 */ +/* type (2b): 0 */ +/******************************************/ +.set Srd127_96, 0x20000 + +/* Global Offset A */ +.macro GLOBAL_OFFSET_A vgprAddr:req, vgprOffsetL:req, vgprOffset0I:req, vgprTmp:req + v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower + v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate K lower + v_add_u32 v[\vgprAddr+0], 0x8, v[\vgprAddr+0] // add prepad for pointer shift + v_lshlrev_b32 v[\vgprAddr+0], 1, v[\vgprAddr+0] // offset *= bytes/element +.endm + +/* Global Offset B */ +.macro GLOBAL_OFFSET_B vgprAddr:req, vgprOffsetL:req, vgprOffset1J:req, vgprTmp:req + v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower + v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate K lower + v_add_u32 v[\vgprAddr+0], 0x8, v[\vgprAddr+0] // add prepad for pointer shift + v_lshlrev_b32 v[\vgprAddr+0], 1, v[\vgprAddr+0] // offset *= bytes/element +.endm + +/******************************************/ +/* Allocate Resources */ +/******************************************/ + +/* Load num of Gemms */ +s_load_dword s51, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 + +/* Load packed kernel args (StaggerU/GSU) */ +s_load_dword s53, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 + +/* Load num of WGs */ +s_load_dword s54, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_waitcnt lgkmcnt(0) // load args +s_lshr_b32 s52, s51, 0x1e // Get arg type +s_and_b32 s51, 0x3fffffff, s51 // Get nums of gemm +s_cmp_eq_u32 s52, 0 
// Is kernel args +s_cbranch_scc0 label_HBMArgs +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_waitcnt lgkmcnt(0) // preload +s_branch label_LoadArgsEnd +label_HBMArgs: + +/* Load address of kernel arguments */ +s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 +s_waitcnt lgkmcnt(0) // wait for args to load +label_LoadArgsEnd: +s_branch label_common_kernel_entry + +/* pad 37 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +label_Preload_Offset_Start: +s_and_b32 s51, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s52, s2, 0x1e // Get arg type +s_mov_b32 s53, s3 // Preload internal args +s_cmp_eq_u32 s52, 0 // Is kernel args +s_cbranch_scc0 label_Preload_HBMArgs +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dword s31, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 +s_load_dwordx8 s[32:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_mov_b64 s[24:25], s[6:7] // move preload data to correct 
sgpr +s_mov_b64 s[26:27], s[8:9] // move preload data to correct sgpr +s_mov_b64 s[28:29], s[10:11] // move preload data to correct sgpr +s_mov_b32 s30, s12 // move preload data to correct sgpr +s_branch label_Preload_LoadArgsEnd +label_Preload_HBMArgs: +s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments +label_Preload_LoadArgsEnd: +s_mov_b32 s[sgprWGM], s4 // Preload internal args2 +s_mov_b32 s54, s5 // Load num of WGs +label_common_kernel_entry: /// for both preload/non-preload common code +s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id +s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id +s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id +s_and_b32 s[sgprStaggerU], s53, 0xffff0000 // Restore StaggerU related vars +s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 +s_and_b32 s[sgprGSU], s53, 0xffff // Restore GSUConfig and GSU +s_mov_b32 s[sgprArgType], s52 +s_mov_b32 m0, 0x20800 // LDS clamp at 133120 bytes +v_mov_b32 v[vgprSerial], v0 // thread serial id + +/* remap workgroup to XCCs */ +s_lshr_b32 s60, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s60, s60 // Get log(WGMXCC) +s_lshr_b32 s61, s[sgprWGM], 0x16 // Get CU_Count +/* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ +s_cmp_gt_i32 s60, 0 +s_cbranch_scc0 label_skip_WGMXCC +/* only remap WGs in the range */ +s_lshr_b32 s57, s54, s60 +s_lshl_b32 s57, s57, s60 +s_cmp_ge_u32 s[sgprWorkGroup0], s57 +s_cbranch_scc1 label_skip_WGMXCC +s_cmp_eq_u32 s61, 0 // CU_Count == 0 ? 
+s_cbranch_scc0 label_XCCG_nonzero +s_lshr_b32 s57, s[sgprWorkGroup0], s60 +s_bfm_b32 s58, s60, 0 +s_and_b32 s58, s[sgprWorkGroup0], s58 +s_lshr_b32 s59, s54, s60 +s_mul_i32 s58, s58, s59 +s_add_u32 s[sgprWorkGroup0], s57, s58 +s_branch label_skip_WGMXCC +label_XCCG_nonzero: +/* temp0 = (wg//CU_Count)*CU_Count */ +v_cvt_f32_u32 v10, s61 // wg//CU_Count +v_rcp_iflag_f32 v10, v10 // wg//CU_Count +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // wg//CU_Count +v_mul_f32 v10, v10, v11 // wg//CU_Count +v_cvt_u32_f32 v10, v10 // wg//CU_Count +v_mul_u32_u24 v11, v10, s61 // wg//CU_Count +v_sub_u32 v11, s[sgprWorkGroup0], v11 // wg//CU_Count +v_cmpx_eq_u32 exec, v11, s61 // wg//CU_Count +v_add_u32 v10, 1, v10 // wg//CU_Count +v_mov_b32 v11, 0 // wg//CU_Count +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s61 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s61 // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup0], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s57, v10 // quotient +v_readfirstlane_b32 s58, v11 // remainder +s_mul_i32 s57, s57, s61 +/* temp1 = (wg%CU_Count)//WGMXCC */ +s_lshr_b32 s58, s58, s60 +/* temp0 = temp0 + temp1 */ +s_add_u32 s57, s57, s58 +/* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ +v_cvt_f32_u32 v10, s61 // WGs//CU_Count +v_rcp_iflag_f32 v10, v10 // WGs//CU_Count +v_cvt_f32_u32 v11, s54 // WGs//CU_Count +v_mul_f32 v10, v10, v11 // WGs//CU_Count +v_cvt_u32_f32 v10, v10 // WGs//CU_Count +v_mul_u32_u24 v11, v10, s61 // WGs//CU_Count +v_sub_u32 v11, s54, v11 // WGs//CU_Count +v_cmpx_eq_u32 exec, v11, s61 // WGs//CU_Count +v_add_u32 v10, 1, v10 // WGs//CU_Count +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s61 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s58, v10 // quotient 
+s_mul_i32 s58, s58, s61 +s_sub_u32 s59, s54, s58 +s_cmp_gt_u32 s[sgprWorkGroup0], s58 +s_cselect_b32 s58, s59, s61 +s_lshr_b32 s58, s58, s60 +s_bfm_b32 s59, s60, 0 +s_and_b32 s59, s[sgprWorkGroup0], s59 +s_mul_i32 s58, s58, s59 +/* WorkGroup0 = temp0 + temp1 */ +s_add_u32 s[sgprWorkGroup0], s57, s58 +label_skip_WGMXCC: /// skip WGMXCC if no enough WGs to remap +//s_mov_b32 s[sgprWorkGroup0], 0 + +/* init: add vgpr [4...136) to pool */ +/* init: add vgpr [0...0) to pool */ +/* init: add agpr [0...256) to pool */ + +/******************************************/ +/* Local Read Addresses */ +/******************************************/ + +/* local read addresses: tile assignments a/b */ +/* lr0I */ +v_and_b32 v5, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v4, 15, v5 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v4, 6, v4 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v4, 3, v4 // 4. apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v5, 4, v5 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v4, v5, 3, v4 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v8, 6, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(64) +v_and_b32 v8, 1, v8 // 7. wave offset in M dimen: wtid0 = wtid / num1DWaves(2) +v_lshl_add_u32 v4, v8, 13, v4 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset +/* lr1J */ +v_and_b32 v6, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v5, 15, v6 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v5, 6, v5 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v5, 3, v5 // 4. 
apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v6, 4, v6 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v5, v6, 3, v5 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v7, 7, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(128) +v_and_b32 v7, 1, v7 // 7. wave offset in M dimen: wtid0 = wtid / num1DWaves(2) +v_lshl_add_u32 v5, v7, 13, v5 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset + +/* local read addresses: final offsets a */ +v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 +v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id +s_mov_b32 s53, 64 // LSU offset: stride = lsuStride(64) when umlds==True +v_mul_lo_u32 v6, s53, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1 // Final Offset: offset = (lro0+lsuoffset)*bpeDS +v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 + +/* local read addresses: final offsets b */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 +v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id + // LSU offset: stride = lsuStride(64) when umlds==True (dup assign opt.) 
+v_mul_lo_u32 v4, s53, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1 // Final Offset: offset = (lro1+lsuoffset)*bpeDS +v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 + +/* local read addresses: declare addresses a */ +/* N/A */ + +/* local read addresses: declare addresses b */ +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8200, v[vgprLocalReadAddrB+0] // += LdsOffsetB (lower) +v_add_u32 v[vgprLocalReadSwapAddrA], 66560, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping +v_add_u32 v[vgprLocalReadSwapAddrB], 66560, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping + +/******************************************/ +/* Local Write Addresses */ +/******************************************/ +/* LVCA = 8 */ +/* v5 = A-unroll = serial%LVCA */ +v_lshrrev_b32 v4, 3, v[vgprSerial] // 4 = Serial / 8 +v_and_b32 v5, 7, v[vgprSerial] // 5 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v5, 3, v5 // v5 = v5 * 8 +v_mov_b32 v8, v5 // copy for GlobalSplitU +/* LVCB = 8 */ +/* v7 = B-unroll = serial%LVCB */ +v_lshrrev_b32 v6, 3, v[vgprSerial] // 6 = Serial / 8 +v_and_b32 v7, 7, v[vgprSerial] // 7 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v7, 3, v7 // v7 = v7 * 8 +v_mov_b32 v9, v7 // copy for GlobalSplitU +/* lwaUnrollAssignmentA = v8 */ +/* lwaUnrollAssignmentB = v9 */ + +/* local write addresses: first offset a */ +v_mul_u32_u24 v10, 0x40, v4 // lwAL**(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v8, v10, 0x1 // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS 
+v_lshrrev_b32 v12, 10, v10 // padding 16 per block 1024 +v_lshl_add_u32 v10, v12, 4, v10 // padding 16 per block 1024 +s_nop 0 // 1 wait states required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrA], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait states +s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping + +/* local write addresses: first offset b */ +v_mul_u32_u24 v10, 0x40, v6 // lwBL**(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v9, v10, 0x1 // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS +v_lshrrev_b32 v12, 10, v10 // padding 16 per block 1024 +v_lshl_add_u32 v10, v12, 4, v10 // padding 16 per block 1024 +v_add_co_u32 v10, vcc, 0x8200, v10 // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=33280 +s_nop 0 // 1 wait states required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrB], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait states +s_add_u32 s[sgprSwapB], s[sgprLocalWriteAddrB], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping +v_mov_b32 v12, MT0 // set MT0 into sgpr +v_mov_b32 v11, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v10, v12 // v10 = ceil(v11 / v12) +v_rcp_iflag_f32 v10, v10 // v10 = ceil(v11 / v12) +v_cvt_f32_u32 v13, v11 // v10 = ceil(v11 / v12) +v_mul_f32 v10, v10, v13 // v10 = ceil(v11 / v12) +v_cvt_u32_f32 v10, v10 // v10 = ceil(v11 / v12) +v_mul_u32_u24 v13, v10, v12 // v10 = ceil(v11 / v12) +v_sub_u32 v13, v11, v13 // v10 = ceil(v11 / v12) +v_cmp_ne_u32 vcc, v13, 0 // v10 = ceil(v11 / v12) +v_addc_co_u32 v10, vcc, v10, 0, vcc // ceil +v_mov_b32 v12, MT1 // set MT1 into sgpr +v_mov_b32 v11, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v10 // set back to 
numWorkGroup0 +v_cvt_f32_u32 v10, v12 // v10 = ceil(v11 / v12) +v_rcp_iflag_f32 v10, v10 // v10 = ceil(v11 / v12) +v_cvt_f32_u32 v13, v11 // v10 = ceil(v11 / v12) +v_mul_f32 v10, v10, v13 // v10 = ceil(v11 / v12) +v_cvt_u32_f32 v10, v10 // v10 = ceil(v11 / v12) +v_mul_u32_u24 v13, v10, v12 // v10 = ceil(v11 / v12) +v_sub_u32 v13, v11, v13 // v10 = ceil(v11 / v12) +v_cmp_ne_u32 vcc, v13, 0 // v10 = ceil(v11 / v12) +v_addc_co_u32 v10, vcc, v10, 0, vcc // ceil +s_nop 0 // 1 wait states +v_readfirstlane_b32 s[sgprNumWorkGroups1], v10 // set back to numWorkGroup1 +s_waitcnt lgkmcnt(0) // wait for 44/0 bytes of kern args + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s52, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s53, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s52, s52, s53 +v_cvt_f32_u32 v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s52 +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_mul_u32_u24 v11, v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s52 +v_cmpx_eq_u32 exec, v11, s52 // s52 = s[sgprWorkGroup0] / s52 +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s52 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s52 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup2], s52 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s52, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s52, s52, s[sgprWorkGroup2] +s_mul_i32 s52, s52, s53 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v10, s[sgprNumWorkGroups0] // 
s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup1], s52 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s52, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 + +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 62 +.set sgprStaggerUIter, 51 +.set sgprWrapUA, 64 +.set sgprWrapUB, 66 +.set sgprGlobalReadIncsA, 68 +.set sgprGlobalReadIncsB, 69 +.set sgprScalarGlobalReadOffsetA, 70 +.set sgprScalarGlobalReadOffsetB, 77 +s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift +s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift +s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift +s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift + +/* Short circuit condition if Alpha == 0, then sumDims=0 */ +v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? 
+s_cbranch_vccz label_AlphaNonZero // branch if s[Alpha] != 0 +s_mov_b32 s[sgprSizesSum+0], 0 // Set summation dim=0 if Alpha == 0 +label_AlphaNonZero: + +/******************************************/ +/* Begin setupNewTile */ +/******************************************/ + +/* global read addresses: work-group */ +/* graWorkGroup mapping */ +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU // branch if GSU == 1 +// GSU-not-WGMapRR :nwg1 = (size1J + MT1J - 1) / MT1J; +s_and_b32 s84, s[sgprGSU], 0x4000 // SCC = (GSUWGMRR == 1) ? +s_cbranch_scc1 label_GSUWGMRR // branch if GSUWGMRR == 1 +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mov_b32 v11, 0 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup1], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v10 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx], v11 // remainder +s_branch label_GSUWGMRR_End +label_GSUWGMRR: +v_cvt_f32_u32 v10, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / 
s[sgprNumWorkGroups1] +v_rcp_iflag_f32 v10, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_f32 v10, v10, v11 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_u32_f32 v10, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_add_u32 v10, 1, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mov_b32 v11, 0 // s[sgprWorkGroup1] = s[sgprWorkGroup1] % s[sgprNumWorkGroups1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups1] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1] // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup1], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprGSUSumIdx], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v11 // remainder +label_GSUWGMRR_End: +s_mov_b32 s[sgprGSULog2BpeC], 1 +s_mov_b32 s[sgprGSULog2BpeD], 2 +s_branch label_GSU_End +label_GSU: +s_mov_b64 s[sgprGSUSumIdx:sgprGSUSumIdx+1], 0 // Set GSUSumIdx to 0 +s_mov_b32 s[sgprGSULog2BpeC], 1 +s_mov_b32 s[sgprGSULog2BpeD], 1 +label_GSU_End: +s_sext_i32_i16 s[sgprWGM], s[sgprWGM] // Restore WGM +s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? +s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 +s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? 
+s_cbranch_scc1 label_WGM // branch if WGM >= 0 +s_abs_i32 s[sgprWGM], s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v10, s[sgprWGM] // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_rcp_iflag_f32 v10, v10 // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_mul_f32 v10, v10, v11 // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_cvt_u32_f32 v10, v10 // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_mul_u32_u24 v11, v10, s[sgprWGM] // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // s86 = s[sgprWorkGroup0] / s[sgprWGM] +v_add_u32 v10, 1, v10 // s86 = s[sgprWorkGroup0] / s[sgprWGM] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s86, v10 // quotient (blockId) +s_mul_i32 s87, s86, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s87, s[sgprWorkGroup0], s87 // s87 = wg0 % WGM (remainder) +s_mul_i32 s87, s87, s[sgprNumWorkGroups1] // (wg0 % WGM)*NumWorkGroups1 +s_add_u32 s87, s87, s[sgprWorkGroup1] // wgSerial = wg1 + (wg0 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v10, s[sgprWGM] // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_rcp_iflag_f32 v10, v10 // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_cvt_f32_u32 v11, s[sgprNumWorkGroups0] // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_mul_f32 v10, v10, v11 // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_cvt_u32_f32 v10, v10 // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_mul_u32_u24 v11, v10, s[sgprWGM] // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_sub_u32 v11, s[sgprNumWorkGroups0], v11 // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +v_add_u32 v10, 1, v10 // s84 = s[sgprNumWorkGroups0] / s[sgprWGM] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s84, v10 // quotient (numFullBlocks) +s_mul_i32 s85, s[sgprWGM], s84 // quotient * non-magic divisor +s_sub_u32 s85, s[sgprNumWorkGroups0], s85 // s85 = NumWorkGroups0 % WGM (remainder) +s_cmp_eq_u32 s85, 0 // remainder == 0 ? +s_cmov_b32 s85, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s86, s84 // blockId >= numFullBlocks ? 
+s_cselect_b32 s84, s85, s[sgprWGM] +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup1] = s87 / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup1] = s87 / s84 +v_cvt_f32_u32 v11, s87 // s[sgprWorkGroup1] = s87 / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup1] = s87 / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup1] = s87 / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup1] = s87 / s84 +v_sub_u32 v11, s87, v11 // s[sgprWorkGroup1] = s87 / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup1] = s87 / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup1] = s87 / s84 +v_mov_b32 v11, 0 // s[sgprWorkGroup0] = s87 % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s87, v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v11 // remainder +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s84 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s87, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s86, s86, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s86 // wg0 += blockId * WGM +s_branch label_WGM +label_WGMPositive: +v_cvt_f32_u32 v10, s[sgprWGM] // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_rcp_iflag_f32 v10, v10 // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_mul_f32 v10, v10, v11 // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_cvt_u32_f32 v10, v10 // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_mul_u32_u24 v11, v10, s[sgprWGM] // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // s86 = s[sgprWorkGroup1] / s[sgprWGM] +v_add_u32 v10, 1, v10 // s86 = s[sgprWorkGroup1] / s[sgprWGM] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s86, v10 // quotient (blockId) +s_mul_i32 s87, s86, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s87, s[sgprWorkGroup1], s87 // 
WorkGroup1=remainder +s_mul_i32 s87, s87, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s87, s87, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprNumWorkGroups1], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s84, v10 // quotient +s_mul_i32 s85, s[sgprWGM], s84 // quotient * non-magic divisor +s_sub_u32 s85, s[sgprNumWorkGroups1], s85 // NumWorkGroups1=remainder +s_cmp_eq_u32 s85, 0 // remainder == 0 ? +s_cmov_b32 s85, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s86, s84 // blockId >= numFullBlocks ? 
+s_cselect_b32 s84, s85, s[sgprWGM] +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup0] = s87 / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup0] = s87 / s84 +v_cvt_f32_u32 v11, s87 // s[sgprWorkGroup0] = s87 / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup0] = s87 / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup0] = s87 / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup0] = s87 / s84 +v_sub_u32 v11, s87, v11 // s[sgprWorkGroup0] = s87 / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup0] = s87 / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup0] = s87 / s84 +v_mov_b32 v11, 0 // s[sgprWorkGroup1] = s87 % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s87, v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup0], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v11 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s84 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s87, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s86, s86, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s86 // wg1 += blockId * WGM +label_WGM: + +/* global read addresses: tile offset assignment a */ +/* graTileAssignmentA = v4 */ + +/* global read addresses: tile offset assignment b */ +/* graTileAssignmentB = v6 */ + +/* global read addresses: unroll assignment a */ +/* v5 */ + +/* global read addresses: unroll assignment b */ +/* v7 */ + +/* global read addresses: other free assignments */ +/* s[sgprWorkGroup2] */ + +/* global read addresses: tile offsets a */ + +/* global read addresses: tile offsets b */ + +/* global read addresses: unroll offsets a */ + +/* global read addresses: unroll offsets b */ + +/* global read addresses: final offsets a */ +GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0, 5, 4, 10 // gROA_0_0_0_0 
+s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 32 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+0], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 64 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+1], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 96 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+2], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+3], s[sgprStrideA0I], 128 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+3], s[sgprScalarGlobalReadOffsetA+3], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+4], s[sgprStrideA0I], 160 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+4], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+5], s[sgprStrideA0I], 192 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+5], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+6], s[sgprStrideA0I], 224 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+6], 0x1 // scalar offset *= bytes/element + +/* global read addresses: final offsets b */ +GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0, 7, 6, 10 // gROB_0_0_0_0 +s_mul_i32 s[sgprScalarGlobalReadOffsetB+0], s[sgprStrideB1J], 32 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+0], s[sgprScalarGlobalReadOffsetB+0], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+1], s[sgprStrideB1J], 64 // compute offset diff 
(scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+1], s[sgprScalarGlobalReadOffsetB+1], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+2], s[sgprStrideB1J], 96 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+2], s[sgprScalarGlobalReadOffsetB+2], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+3], s[sgprStrideB1J], 128 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+3], s[sgprScalarGlobalReadOffsetB+3], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+4], s[sgprStrideB1J], 160 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+4], s[sgprScalarGlobalReadOffsetB+4], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+5], s[sgprScalarGlobalReadOffsetB+5], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+6], s[sgprScalarGlobalReadOffsetB+6], 0x1 // scalar offset *= bytes/element + +/* global read addresses: addresses a */ +/* max read offset = size[n] * stride[n-1] */ +s_mul_hi_u32 s87, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s86, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s87, s86, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s86, s86, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_A // branch if GSUC == 1 +s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s84, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_A_End +label_GSUC_A: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s84, 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s85, s85, s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder 
+s_mul_i32 s84, s84, s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s84, s84, s85 // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s85, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s84, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_A_End: +s_add_u32 s86, s86, s84 // accum GsuOffset term to tilestart +s_addc_u32 s87, s87, s85 // accum GsuOffset term to tilestart +s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size +s_sub_u32 s84, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s85, constStrideAL, s84 // stride x (size-1) +s_mul_i32 s84, constStrideAL, s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size +s_sub_u32 s84, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s85, s[sgprStrideA0I], s84 // stride x (size-1) +s_mul_i32 s84, s[sgprStrideA0I], s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s86 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s87 // sub tileStart +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x1 // Set limit to use bytes +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s85, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s84, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s86, s86, s84 // accum wg term to tilestart +s_addc_u32 s87, s87, s85 // accum wg term to tilestart +s_lshl_b64 s[86:87], s[86:87], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s86 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s87 // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +/* global read addresses: addresses b */ +/* max read offset = size[n] * stride[n-1] */ +s_mul_hi_u32 s87, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s86, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s87, s86, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s86, s86, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_B // branch if GSUC == 1 +s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s84, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_B_End +label_GSUC_B: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s84, 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s85, s85, s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder 
+s_mul_i32 s84, s84, s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s84, s84, s85 // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s85, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s84, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_B_End: +s_add_u32 s86, s86, s84 // accum GsuOffset term to tilestart +s_addc_u32 s87, s87, s85 // accum GsuOffset term to tilestart +s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size +s_sub_u32 s84, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s85, constStrideBL, s84 // stride x (size-1) +s_mul_i32 s84, constStrideBL, s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size +s_sub_u32 s84, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s85, s[sgprStrideB1J], s84 // stride x (size-1) +s_mul_i32 s84, s[sgprStrideB1J], s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s86 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s87 // sub tileStart +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x1 // Set limit to use bytes +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s85, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s84, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s86, s86, s84 // accum wg term to tilestart +s_addc_u32 s87, s87, s85 // accum wg term to tilestart +s_lshl_b64 s[86:87], s[86:87], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s86 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s87 // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +/* global read addresses: increments a */ +s_and_b32 s85, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s85, s85, DepthU*BpeAGR // GSU*DepthU*Bpe +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR, s85 // incrA (unrollIdx) + +/* global read addresses: increments b */ +s_and_b32 s85, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s85, s85, DepthU*BpeBGR // GSU*DepthU*Bpe +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR, s85 // incrB (unrollIdx) +/* declare loop num iterations */ +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 6 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 64 +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? 
+s_cbranch_scc1 label_GSU_1 // branch if GSU == 1 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_add_u32 s84, 1, s[sgprLoopCounterL] // tmp<-numIterMyWg+1 +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cmov_b32 s[sgprLoopCounterL], s84 // numIterMyWg++ if needed +label_GSU_1: +s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter +s_and_b32 s86, s[sgprStaggerU], 0x1f00 +s_lshr_b32 s86, s86, 0x8 +s_and_b32 s87, s[sgprStaggerU], 0xe000 +s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff +s_mov_b32 s84, 
s[sgprStaggerU] // init staggerU +label_beginStaggerUIter: +s_lshl_b32 s85, s84, s86 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s85 // loopCount >= current shift Count +s_cbranch_scc1 label_endStaggerUIter // jump to end +s_lshr_b32 s84, s84, 1 // step down to smaller stagger +s_branch label_beginStaggerUIter // jump to begin +label_endStaggerUIter: +s_sub_u32 s85, s84, 1 // staggerU mask +s_cmp_ge_u32 s84, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s85, 0 // set Mask +s_cmp_eq_u32 s87, 0x0 +s_cbranch_scc1 label_StaggerUMapping_1 +s_mov_b32 s84, s[sgprWorkGroup0] +s_branch label_staggerInputEnd +label_StaggerUMapping_1: +s_cmp_eq_u32 s87, 0x2000 +s_cbranch_scc1 label_StaggerUMapping_2 +s_mov_b32 s84, s[sgprWorkGroup1] +s_branch label_staggerInputEnd +label_StaggerUMapping_2: +s_cmp_eq_u32 s87, 0x4000 +s_cbranch_scc1 label_StaggerUMapping_3 +s_mov_b32 s84, -0x1 +s_branch label_staggerInputEnd +label_StaggerUMapping_3: +s_cmp_eq_u32 s87, 0x6000 +s_cbranch_scc1 label_StaggerUMapping_4 +s_mul_i32 s85, s[sgprNumWorkGroups0], s[sgprWorkGroup1] +s_add_u32 s84, s84, s85 +s_add_u32 s84, s84, s[sgprWorkGroup0] +s_branch label_staggerInputEnd +label_StaggerUMapping_4: +s_cmp_eq_u32 s87, 0x8000 +s_cbranch_scc1 label_staggerInputEnd +s_mov_b32 s84, -0x1 +s_branch label_staggerInputEnd +label_staggerInputEnd: +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s84 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s86 // shift by StaggerUStride + +/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ +s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop +s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes 
accessed by the unroll loop +s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration +s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ +s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop +s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop +s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration +s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap +/* local read addresses: init pointers a */ + +/* localReadInitPointers */ +/* local read addresses: init pointers b */ + +/* localReadInitPointers */ + +/* prefetch: global -> local */ +s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? +s_cbranch_scc1 label_ShadowInitStart // skip to ShadowInitStart iter b/c numIter==0 + +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> Reg 
0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +// Interleave Init C +v_accvgpr_write acc0, 0 // initC +v_accvgpr_write acc1, 0 // initC +v_accvgpr_write acc2, 0 // initC +v_accvgpr_write acc3, 0 // initC +v_accvgpr_write acc4, 0 // initC +v_accvgpr_write acc5, 0 // initC +v_accvgpr_write acc6, 0 // initC +v_accvgpr_write acc7, 0 // initC +v_accvgpr_write acc8, 0 // initC +v_accvgpr_write acc9, 0 // initC +v_accvgpr_write acc10, 0 // initC +v_accvgpr_write acc11, 0 // initC +v_accvgpr_write acc12, 0 // initC +v_accvgpr_write acc13, 0 // initC +v_accvgpr_write acc14, 0 // initC +v_accvgpr_write acc15, 0 // initC + +v_mov_b64 v[6:7], 0 +v_mov_b64 v[8:9], 0 + +v_mfma_f32_32x32x16_bf16 acc[16:31], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[32:47], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[48:63], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[64:79], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[80:95], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[96:111], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[112:127], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[128:143], v[6:9], 
v[6:9], acc[0:15] + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +v_mfma_f32_32x32x16_bf16 acc[144:159], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[160:175], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[176:191], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[192:207], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[208:223], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[224:239], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[240:255], v[6:9], v[6:9], acc[0:15] + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +/* global read inc A loopL */ +s_add_u32 s86, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s86 // Is this wrapIter? (pf) +s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUA+1], 0 // incUpper <- ? 
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +/* global read inc B loopL */ +s_add_u32 s86, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s86 // Is this wrapIter? (pf) +s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + +/******************************************/ +/* End setupNewTile */ +/******************************************/ +label_ShadowInitStart: +s_mov_b64 s[sgprSrdD+0:sgprSrdD+0+1], s[sgprAddressD+0:sgprAddressD+0+1] // init SRD base address +s_mov_b32 s[sgprSrdD+2], BufferOOB +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +s_mov_b64 s[sgprSrdC+0:sgprSrdC+0+1], s[sgprAddressC+0:sgprAddressC+0+1] // init SRD base address +s_mov_b32 s[sgprSrdC+2], BufferOOB +s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD + + +s_mul_i32 s86, MT1, s[sgprWorkGroup1] // <- wg1*MT1 +s_mul_hi_u32 s85, s86, s[sgprStrideC1J] // ScaleC s86 by Stride +s_mul_i32 s84, s86, s[sgprStrideC1J] // ScaleC s86 by Stride +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC] // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s85 // add hi to SRD +s_mul_hi_u32 s85, s86, s[sgprStrideD1J] // ScaleD s86 by Stride +s_mul_i32 s84, s86, s[sgprStrideD1J] // ScaleD s86 by Stride +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD] // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s85 // add hi to SRD + +s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC] // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s85 // add hi to SRD +s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD] // scale by 
bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85 // add hi to SRD + +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU_2 // branch if GSU == 1 +// GSU Output Buffer offset: Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK * GSUIdx * bpe%s +s_mul_hi_u32 s85, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0 +s_mul_i32 s84, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0 +s_sub_u32 s86, s[sgprSizesFree+1], 1 // Free1 +s_mul_i32 s86, s86, s[sgprGSUSumIdx] // Free1 +s_mul_hi_u32 s87, s86, s[sgprStrideC1J] // Free1 +s_mul_i32 s86, s86, s[sgprStrideC1J] // Free1 +s_add_u32 s84, s84, s86 // Free1 +s_addc_u32 s85, s85, s87 // Free1 +s_sub_u32 s86, s[sgprSizesFree+2], 1 // Free2 +s_mul_i32 s86, s86, s[sgprGSUSumIdx] // Free2 +s_mul_hi_u32 s87, s86, s[sgprStrideCK] // Free2 +s_mul_i32 s86, s86, s[sgprStrideCK] // Free2 +s_add_u32 s84, s84, s86 // Free2 +s_addc_u32 s85, s85, s87 // Free2 +s_lshl_b64 s[84:85], s[84:85], 2 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84 // add lo GSU offset to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85 // add hi GSU offset to SRD +label_GSU_2: +.set sgprGSULog2BpeC, UNDEF +.set sgprAddressC, UNDEF + +s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? 
+ +/* after InitC, skip to end of prefetch last iter if numIter==0 */ +s_cbranch_scc0 label_NoBranch_T8JHFHKM7BO5OHXW // Only branch on scc1 +s_getpc_b64 s[84:85] // addr of next instr +s_add_i32 s86, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s84, s84, s86 // add target branch offset +s_addc_u32 s85, s85, 0 // add high and carry +s_setpc_b64 s[84:85] // branch to label_PrefetchGlobalLastIterEnd +label_NoBranch_T8JHFHKM7BO5OHXW: + +/* local write a */ + +/* local write b */ + +/* local write swap a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* local write swap b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR +s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // PGR=2 but only 1 loop +s_cbranch_scc1 label_skipPGR2 // PGR=2 but only 1 loop +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address + +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], 
s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write 
address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + + +/* local write swap a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* local write swap b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR + + +label_skipPGR2: + +s_waitcnt vmcnt(24) // wait until at most 24 vector-memory ops are outstanding. NOTE(review): the branch to label_skipPGR2 bypasses the 16 buffer_load...lds of the second prefetch, so on that path this wait and the vmcnt(16) below may not wait for any first-tile load - confirm the 1-iteration case is covered elsewhere +s_barrier // workgroup barrier ahead of the A local reads below + +/* local read prefetch a */ +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +s_waitcnt vmcnt(16) // wait until at most 16 vector-memory ops are outstanding. NOTE(review): same concern as the vmcnt(24) above when arriving via the branch to label_skipPGR2 - confirm +s_barrier // workgroup barrier ahead of the B local reads below + +/* local read prefetch b */ +ds_read_b128 
v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +s_waitcnt lgkmcnt(0) + +/******************************************/ +/* Unrolled Loop(s) - Begin */ +/******************************************/ +label_openLoopL: +s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL < EndCounter +s_cbranch_scc1 label_toPGR1 // PGR=2 but only 1 loop, toPGR1 +s_cmp_le_u32 s[sgprLoopCounterL], 0x2 // LoopCounterL < EndCounter +s_cbranch_scc1 label_LoopEndL // do not enter LoopL + +label_LoopBeginL: + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left 
value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* global read inc A loopL */ +s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? +s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +s_cselect_b32 s85, s[sgprWrapUA+1], 0 // incUpper <- ? 
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +/* global read inc B loopL */ +s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUB+1], 0 // incUpper <- ? 
+/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +/* mfmaIndex:16 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+/* mfmaIndex:17 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +/* mfmaIndex:18 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:19 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +s_waitcnt lgkmcnt(0) // wait for A local reads +/* mfmaIndex:21 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +s_barrier + +/* mfmaIndex:22 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +/* mfmaIndex:23 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:24 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 
v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:25 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> Reg 0_0_1_0 +/* mfmaIndex:26 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:27 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:28 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> Reg 0_0_2_0 +/* mfmaIndex:29 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:30 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = 
acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:31 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> Reg 0_0_3_0 +/* mfmaIndex:32 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:33 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:34 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> Reg 0_0_4_0 +/* mfmaIndex:35 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:36 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], 
acc[144:147] // left value = acc[144+0:147+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:37 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:39 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:41 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 
buffer=1 iui=0 + /* mfmaIndex:43 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] + /* mfmaIndex:47 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +s_waitcnt lgkmcnt(0) +/* mfmaIndex:51 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +s_barrier + +/* mfmaIndex:52 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], 
v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> Reg 0_0_5_0 +/* mfmaIndex:53 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:54 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> Reg 0_0_6_0 +/* mfmaIndex:56 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:57 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] + +/* mfmaIndex:58 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:59 */ 
+v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +/* mfmaIndex:60 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] + +/* mfmaIndex:61 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 + +/* mfmaIndex:62 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:63 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] + +// Iteration one + +/* mfmaIndex:64 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* local write swap offsets a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], 
s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] + +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], 
v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] + +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], 
v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> Reg 0_0_2_0 + +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] + 
+ +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_waitcnt vmcnt(13) // wait for previous set of global reads + +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +s_barrier + +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], 
s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 + +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + + +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] + +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 + +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] + +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], 
v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] + +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_bf16 
acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] + +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] + + +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] + + + +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] + + + +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] + +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:115 
*/ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] + + +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] + + +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] + + +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] + + +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] + +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], 
acc[232:235] // left value = acc[232+0:235+0] + +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* local write swap offsets b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +s_cmp_eq_i32 s[sgprLoopCounterL], 0x2 // counterL==2 +s_waitcnt lgkmcnt(0) + +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +s_cbranch_scc0 label_LoopBeginL // restart LoopL +label_LoopEndL: + +/* Before NLL: Check VGPR.checkin for INT8 LW */ + +/******************************************/ +/* Ord. 
NoGlobalLoadLoop - Begin */ +/******************************************/ + +/* iter 0 (reset local read pointers iteration) (swap local read pointers iteration) */ +/* grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] + + +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] + + +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] 
+ +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] + +/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + + +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] + +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] + +/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 
v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] + +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] + +/* mfmaIndex:16 */ +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 
rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 
// L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 
v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ + +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk + +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* local read init pointers a */ + +/* localReadInitPointers */ + +/* local read init pointers b */ + +/* localReadInitPointers */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ + +/* iter 1 (swap and reset local write pointers iteration) */ +/* grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], 
acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], 
v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], 
v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left 
value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ + +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +s_waitcnt vmcnt(0) // wait for global reads with lds + +/* mfmaIndex:106 */ 
+v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +s_barrier +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +ds_read_b128 
v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] 
offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 
rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=1 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ +label_toPGR1: +s_and_b32 s8, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s8, 1 // GSU == 1 ? +s_cbranch_scc0 label_GSU_3 // branch if GSU != 1 + +/******************************************/ +/* Opt. 
NoLoadLoop - Begin */ +/******************************************/ +s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 +s_cbranch_scc0 label_OptNLL_End // Branch if Beta is not zero + +s_cmp_eq_u32 s[sgprAlpha], 1.0 // Alpha == 1.0 ? +s_cbranch_scc0 label_OptNLL_End // branch if alpha != 1 + +s_and_b32 s84, 255, s[sgprSizeI] // s84 = s[sgprSizeI] % 256 +s_add_u32 s85, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s85 // wg0 >= nwg0-1 ? +s_cselect_b32 s84, s84, 0 // set rMT0 +s_cmpk_gt_u32 s84, 0 // rMT0 > 0 +s_cbranch_scc1 label_OptNLL_End // jump if edges required +s_and_b32 s84, 255, s[sgprSizeJ] // s84 = s[sgprSizeJ] % 256 +s_add_u32 s85, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s85 // wg1 >= nwg1-1 +s_cselect_b32 s84, s84, 0 // set rMT1 +s_cmpk_gt_u32 s84, 0 // rMT1 > 0 +s_cbranch_scc1 label_OptNLL_End // jump if edges required + + + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] + +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = 
acc[12+0:15+0] + +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] + +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] + +/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] + +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 
v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] + +/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] + +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] + +/* mfmaIndex:16 */ +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:17 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:23 */ +/* localReadsVacancy: 
latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 
eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ + +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] + +/* iter 1 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = 
acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] 
// left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], 
v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ 
+v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], 
v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* 
mfmaIndex:116 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], 
v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ +label_toPGR1end_OptNLL: +/* Stores for OptNLL */ +label_Summation_End_OptNLL: +/* endSummation: add vgpr [0...132) to pool */ +/* load store sgprs */ + +/* Mapping of Acc register -> C Vgpr register */ +/* computeStoreVgprs */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 +v_lshrrev_b32 v5, 1, v4 // 5 = 4 / 2 +v_mul_lo_u32 v5, 0x10, v5 // wave coordination offset 1 +v_and_b32 v1, 63, v[vgprSerial] // v1 = v[vgprSerial] % 64 +v_lshrrev_b32 v1, 4, v1 // 1 = 1 / 16 +v_lshlrev_b32 v1, 2, v1 // thread0 * continuous_output +v_add_lshl_u32 v1, v5, v1, 3 // coordination 1 = vwB *(wave_id1 + tid1) +v_mul_lo_u32 v2, v1, s[sgprStrideC1J] // offset 1 +v_mul_lo_u32 v3, v1, s[sgprStrideD1J] // offset 1 +v_and_b32 v0, 1, v4 // v0 = v4 % 2 +v_mul_lo_u32 v0, 0x10, v0 // wave coordination offset 0 +v_and_b32 v5, 15, v[vgprSerial] // v5 = v[vgprSerial] % 16 +v_add_lshl_u32 v0, v5, v0, 3 // coordination 0 = vwA * (wave_id0 + tid0) +s_mul_i32 s8, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v0, s8, v0 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s8, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v1, s8, v1 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 + +/******************************************/ +/* Global Write Elements */ 
+/******************************************/ +label_GW_B0_E0: + +/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_lshl_u32 v11, v3, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=0, coord0Vgpr=0 +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+216], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+217], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+218], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+219], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+220], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+221], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+222], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+223], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+224], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+225], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+226], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+227], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+228], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+229], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+230], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+231], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+232], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+233], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+234], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+235], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+236], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+237], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+238], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+239], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+240], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+241], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+242], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+243], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+244], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+245], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+246], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+247], acc127 // copy acc to vreg[223] + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v8, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v9, 0x7fff0000 // fp32 Nan +v_mov_b32 v10, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, 
v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert 
C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 
and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt 
// store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], 
s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 
and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], 
v[vgprValuC+161] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD 
+= inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v216, v[vgprValuC+216], v[vgprValuC+217] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v217, v[vgprValuC+218], v[vgprValuC+219] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v218, v[vgprValuC+220], v[vgprValuC+221] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v219, v[vgprValuC+222], v[vgprValuC+223] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v224, v[vgprValuC+224], v[vgprValuC+225] // convert C to 
bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v225, v[vgprValuC+226], v[vgprValuC+227] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v226, v[vgprValuC+228], v[vgprValuC+229] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v227, v[vgprValuC+230], v[vgprValuC+231] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v232, v[vgprValuC+232], v[vgprValuC+233] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v233, v[vgprValuC+234], v[vgprValuC+235] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v234, v[vgprValuC+236], v[vgprValuC+237] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v235, v[vgprValuC+238], v[vgprValuC+239] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v240, v[vgprValuC+240], v[vgprValuC+241] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v241, v[vgprValuC+242], v[vgprValuC+243] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v242, v[vgprValuC+244], v[vgprValuC+245] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v243, v[vgprValuC+246], v[vgprValuC+247] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[240:243], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+16], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+17], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+20], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+21], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+22], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+23], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+24], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+25], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+26], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+27], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+28], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+29], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+30], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+31], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+32], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+33], acc199 // 
copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+34], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+35], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+36], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+37], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+38], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+39], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+40], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+41], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+42], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+43], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+44], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+45], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+46], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+47], acc255 // copy acc to vreg[255] + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v8, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v9, 0x7fff0000 // fp32 Nan +v_mov_b32 v10, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert 
C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // 
incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End // jump to end +label_GW_End: + +s_endpgm // Kernel End +label_OptNLL_End: +label_GSU_3: + +/******************************************/ +/* Ord. NoLoadLoop - Begin */ +/******************************************/ + +/* iter 0 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:0 */ +s_waitcnt lgkmcnt(7) // wait for prior local read local write old=0, new=7 newLW=0 newLR=7 for iteration == 0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:1 */ +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:2 */ +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:3 */ +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // 
left value = acc[12+0:15+0] +/* mfmaIndex:4 */ +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:5 */ +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:6 */ +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:7 */ +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:8 */ +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +s_waitcnt lgkmcnt(8) // wait for prior local read local write +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:9 */ 
+ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:10 */ +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:11 */ +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:12 */ +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:13 */ +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:14 */ +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 
swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:15 */ +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:16 */ +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = 
acc[80+0:83+0] +/* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:29 */ 
+/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ + +/* iter 1 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 
*/ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] 
+/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = 
acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], 
v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], 
acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], 
v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], 
acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ +label_toPGR1end_OrdNLL: +label_PrefetchGlobalLastIterEnd: + +/* Tail: add ValuA/B vgpr buffer [4...132) to pool */ + +/* Tail: add address/G2L vgpr [132...132) to pool */ +label_Summation_End_S4FDBQ587JJL6NOU: +.set sgprWGM, UNDEF +.set sgprLoopCounterL, UNDEF +.set sgprOrigLoopCounter, UNDEF +.set sgprAddressA, UNDEF +.set sgprAddressB, UNDEF +.set sgprStridesA, UNDEF +.set sgprStridesB, UNDEF +.set sgprStaggerUIter, UNDEF +.set sgprSrdA, UNDEF +.set sgprSrdB, UNDEF +.set sgprShadowLimitA, UNDEF +.set sgprShadowLimitB, UNDEF +.set sgprWrapUA, UNDEF +.set sgprWrapUB, UNDEF +.set sgprGlobalReadIncsA, UNDEF +.set sgprGlobalReadIncsB, UNDEF +.set sgprScalarGlobalReadOffsetA, UNDEF +.set sgprScalarGlobalReadOffsetB, UNDEF +/* load store sgprs */ + +/* Mapping of Acc register -> C Vgpr register */ + +/* not-LocalSplitU: global write indices */ +/* computeStoreVgprs */ +v_lshrrev_b32 v8, 6, v[vgprSerial] // 8 = Serial / 64 +v_lshrrev_b32 v9, 1, v8 // 9 = 8 / 2 +v_mul_lo_u32 v9, 0x10, v9 // wave coordination offset 1 +v_and_b32 v5, 63, v[vgprSerial] // v5 = v[vgprSerial] % 64 +v_lshrrev_b32 v5, 4, v5 // 5 = 5 / 16 +v_lshlrev_b32 v5, 2, v5 // thread0 * continuous_output +v_add_lshl_u32 v5, v9, v5, 3 // coordination 1 = vwB *(wave_id1 + tid1) +v_mul_lo_u32 v6, v5, s[sgprStrideC1J] // offset 1 +v_mul_lo_u32 v7, v5, s[sgprStrideD1J] // offset 1 
+v_and_b32 v4, 1, v8 // v4 = v8 % 2 +v_mul_lo_u32 v4, 0x10, v4 // wave coordination offset 0 +v_and_b32 v9, 15, v[vgprSerial] // v9 = v[vgprSerial] % 16 +v_add_lshl_u32 v4, v9, v4, 3 // coordination 0 = vwA * (wave_id0 + tid0) +s_mul_i32 s8, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v4, s8, v4 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s8, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v5, s8, v5 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 + +/* not-LocalSplitU: global write */ + +/******************************************/ +/* Global Write Elements */ +/******************************************/ +s_and_b32 s8, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s8, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU_4 // branch if GSU == 1 +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B0_E1_M // jump if edges required +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B0_E1_N // jump if edges required +label_GW_B0_E0_1: + +/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=26 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_lshl_u32 v15, v7, v4, 0x2 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+27], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+28], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+29], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+30], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+31], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+32], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+33], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+34], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+35], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+36], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+37], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+38], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+39], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+40], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+41], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+42], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+43], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+44], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+45], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+46], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+47], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+48], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+49], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+50], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+51], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+52], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+53], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+54], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+55], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+56], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+57], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+58], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+59], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+60], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+61], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+62], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+63], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+64], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+65], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+66], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+67], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+68], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+69], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+70], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+71], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+72], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+73], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+74], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+75], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+76], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+77], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+78], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+79], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+80], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+81], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+82], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+83], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+84], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+85], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+86], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+87], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+88], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+89], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+90], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+91], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+92], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+93], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+94], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+95], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+96], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+97], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+98], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+99], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+100], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+101], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+102], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+103], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+104], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+105], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+106], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+107], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+108], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+109], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+110], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+111], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+112], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+113], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+114], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+115], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+116], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+117], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+118], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+119], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+120], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+121], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+122], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+123], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+124], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+125], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+126], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+127], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+136], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+137], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+138], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+139], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+140], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+141], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+142], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+143], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+144], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+145], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+146], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+147], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+148], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+149], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+150], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+151], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+152], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+153], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+154], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+155], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+156], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+157], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+158], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+159], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+160], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+161], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+162], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+163], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+164], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+165], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+166], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+167], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+168], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+169], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+170], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+171], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+172], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+173], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+174], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+175], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+176], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+177], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+178], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+179], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+180], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+181], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+182], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+183], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+184], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+185], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+186], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+187], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+188], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+189], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+190], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+191], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+192], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+193], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+194], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+195], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+196], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+197], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+198], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+199], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+200], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+201], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+202], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+203], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+204], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+205], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+206], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+207], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+208], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+209], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+210], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+211], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+212], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+213], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+214], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+215], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+216], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+217], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+218], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+219], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+220], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+221], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+222], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+223], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+224], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+225], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+226], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+227], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+228], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+229], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+230], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+231], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+232], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+233], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+234], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+235], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+236], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+237], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+238], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+239], acc63 // copy acc to vreg[207] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[84:87], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[92:95], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[100:103], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[108:111], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[116:119], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[124:127], v15, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[140:143], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[148:151], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[156:159], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[164:167], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) 
+buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[172:175], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[180:183], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[188:191], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[196:199], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[204:207], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // 
incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[212:215], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[220:223], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[228:231], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[236:239], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ 
+/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary): this batch emits no coordinate, mask or load instructions; it only copies 48 accumulator values (acc67..acc255, every 4th register) into v[vgprValuC+24..vgprValuC+71] */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+24], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+25], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+26], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+27], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+28], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+29], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+30], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+31], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+32], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+33], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+34], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+35], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+36], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+37], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+38], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+39], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+40], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+41], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+42], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+43], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+44], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+45], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+46], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+47], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+48], 
acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+49], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+50], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+51], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+52], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+53], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+54], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+55], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+56], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+57], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+58], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+59], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+60], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+61], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+62], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+63], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+64], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+65], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+66], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+67], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+68], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+69], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+70], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+71], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] -- note: no scaling instructions are emitted for this batch; the copied values are stored as-is */ + +/* apply mask, calc new C and issue writes: no mask is applied in this batch; for each of the 6 rows the D SRD base (s[sgprSrdD+0], s[sgprSrdD+1]) is advanced by s[sgprStrideD1J] << 2 with carry, then 8 dwords are written as two dwordx4 stores at offsets 0 and 16, all using v15 as the offen offset */ +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // 
incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], 
s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_B0_E1_N: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // 
copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 
v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to vreg[145] 
+v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+198], acc186 
// copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[24:27], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v128, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:16 nt // store D +buffer_store_dwordx4 v[32:35], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[40:43], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[48:51], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[56:59], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[64:67], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[72:75], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[80:83], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[84:87], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[88:91], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[92:95], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[96:99], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[100:103], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[104:107], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[108:111], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[112:115], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dwordx4 v[116:119], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[120:123], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[124:127], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[136:139], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[140:143], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[144:147], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[148:151], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[152:155], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[156:159], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[160:163], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[164:167], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[168:171], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[172:175], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[176:179], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[180:183], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[184:187], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[188:191], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[192:195], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[196:199], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[200:203], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[204:207], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // 
store D +buffer_store_dwordx4 v[208:211], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[212:215], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords and apply mask per element: advance one row, bounds-check against SizeI/SizeJ, build the clipped store offset (no loads are issued in this batch) */ +v_mov_b32 v10, BufferOOB // v10 = BufferOOB, the offset given to out-of-bounds lanes +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v81, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v81, v10, v81, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v83, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v83, v10, v83, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v84, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v84, v10, v84, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v85, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v85, v10, v85, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr (v5) += 1, next vc1 row + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr (v6) to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD (v7) to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v86, v7, v4, 0x2 // scaleToBpe: (coutRowPtrD + coord0) shifted left by 2 = D store offset +v_cndmask_b32 v86, v10, v86, s[34:35] // LDD clip if OOB: out-of-bounds lanes get BufferOOB (v10) as their offset +v_accvgpr_read_b32 v[vgprValuC+16], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+17], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+18], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+19], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+20], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+21], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+22], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+23], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+24], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+25], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+26], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+27], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+28], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+29], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+30], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+31], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+32], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+33], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+34], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+35], acc79 // copy acc to 
vreg[211] +v_accvgpr_read_b32 v[vgprValuC+36], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+37], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+38], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+39], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+40], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+41], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+42], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+43], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+44], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+45], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+46], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+47], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+48], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+49], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+50], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+51], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+52], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+53], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+54], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+55], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+56], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+57], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+58], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+59], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+60], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+61], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+62], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+63], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+64], acc195 // copy acc to 
vreg[240] +v_accvgpr_read_b32 v[vgprValuC+65], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+66], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+67], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+68], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+69], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+70], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+71], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+72], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+73], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+74], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+75], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+76], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+77], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+78], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+79], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ + +/* issue writes: each element is stored as two dwordx4 halves (offset:0 and offset:16) at its clipped offset */ +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[24:27], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[32:35], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[40:43], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D 
+buffer_store_dwordx4 v[48:51], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[56:59], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[64:67], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[72:75], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_B0_E1_M: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=114 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); 
(0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+16], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+17], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+18], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+19], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+20], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+21], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+22], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+23], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+24], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+25], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+26], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+27], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+28], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+29], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+30], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+31], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+32], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+33], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+34], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+35], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+36], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+37], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+38], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+39], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+40], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+41], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+42], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+43], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+44], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+45], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+46], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+47], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+48], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+49], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+50], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+51], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+52], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+53], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+54], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+55], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+56], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+57], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+58], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+59], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+60], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+61], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+62], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+63], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+64], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+65], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+66], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+67], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+68], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+69], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+70], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+71], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+72], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+73], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+74], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+75], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+76], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+77], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+78], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+79], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+80], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+81], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+82], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+83], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+84], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+85], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+86], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+87], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+88], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+89], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+90], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+91], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+92], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+93], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+94], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+95], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+96], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+97], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+98], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+99], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+100], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+101], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+102], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+103], acc97 // copy 
acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+104], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+105], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+106], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+107], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+108], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+109], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+110], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+111], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+112], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+113], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+114], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+115], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+116], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+117], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+118], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+119], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+120], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+121], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+122], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+123], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+124], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+125], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+126], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+127], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+128], acc197 // copy acc to vreg[113] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), 
(0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v67, v184, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v90, v207, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v113, v230, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); 
(0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe 
into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+16], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+17], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+18], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+19], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+20], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+21], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+22], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+23], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+24], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+25], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+26], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+27], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+28], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+29], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+30], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+31], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+32], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+33], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+34], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+35], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+36], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+37], acc34 // 
copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+38], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+39], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+40], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+41], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+42], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+43], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+44], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+45], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+46], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+47], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+48], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+49], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+50], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+51], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+52], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+53], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+54], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+55], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+56], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+57], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+58], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+59], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+60], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+61], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+62], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+63], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+64], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+65], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+66], acc150 // copy acc to 
vreg[165] +v_accvgpr_read_b32 v[vgprValuC+67], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+68], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+69], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+70], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+71], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+72], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+73], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+74], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+75], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+76], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+77], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+78], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+79], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+80], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+81], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+82], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+83], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+84], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+85], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+86], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+87], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+88], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+89], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+90], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+91], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+92], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+93], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+94], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+95], acc11 // copy acc to 
vreg[194] +v_accvgpr_read_b32 v[vgprValuC+96], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+97], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+98], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+99], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+100], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+101], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+102], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+103], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+104], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+105], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+106], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+107], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+108], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+109], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+110], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+111], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+112], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+113], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+114], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+115], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+116], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+117], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+118], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+119], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+120], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+121], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+122], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+123], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+124], acc127 // copy 
acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+125], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+126], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+127], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+128], acc143 // copy acc to vreg[227] + +/* rC *= alpha batchElements=[(0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 
0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v83, v200, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v106, v223, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state 
required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary). No loads are issued in this batch. Per element: v8 = coord0 (v4) plus vc0 (v4 itself is used when vc0 is 0); the mask is (coord0 below SizeI) and (coord1 below SizeJ); the byte offset (v7 plus coord0, shifted left by 2) lands in v43..v70 and is replaced by BufferOOB (v10) when the mask is false. v7 is the D row pointer (coutRowPtrD), so the 'Cin addr' wording in the per-element comments actually describes the D store offset. */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v43, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v43, v10, v43, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v45, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v45, v10, v45, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode: after coord1 (v5) is incremented, advance both row pointers by one row (v6 by StrideC1J, v7 by StrideD1J) */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v47, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v47, v10, v47, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v49, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v49, v10, v49, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v51, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v51, v10, v51, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v53, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v53, v10, v53, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v55, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v55, v10, v55, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v57, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v57, v10, v57, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v59, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v59, v10, v59, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v61, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v61, v10, v61, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v63, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v63, v10, v63, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v65, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v65, v10, v65, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v67, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v67, v10, v67, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v69, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v69, v10, v69, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+16], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+17], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+18], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+19], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+20], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+21], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+22], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+23], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+24], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+25], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+26], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+27], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+28], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+29], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+30], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 
v[vgprValuC+31], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+32], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+33], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+34], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+35], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+36], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+37], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+38], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+39], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+40], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+41], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+42], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ + +/* apply mask, calc new C and issue writes: no alpha or beta arithmetic is emitted in this batch; v15..v42 are stored to D at the offsets held in v43..v70 (presumably vgprValuC is 0 so these are the registers filled by the acc reads above; TODO confirm) */ +buffer_store_dword v15, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v23, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v37, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v69, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst (NOTE(review): the stores above are buffer_store_dword, not dwordx4; confirm this nop is needed) +s_branch label_GW_End_1 // jump to end (the target label immediately follows this instruction) +label_GW_End_1: +s_getpc_b64 s[30:31] // addr of next instr; first step of a getpc/add/setpc sequence that jumps to label_KernelEnd +s_add_i32 s32, label_KernelEnd, 4 // target branch offset +s_add_u32 s30, s30, s32 // add target 
branch offset +s_addc_u32 s31, s31, 0 // add high and carry +s_setpc_b64 s[30:31] // branch to label_KernelEnd +label_GSU_4: +s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 +s_cbranch_scc0 label_GW_Beta_2 // Branch if Beta is not zero + +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0: keep SizeI % 256 only when wg0 >= nwg0-1, otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B0_E1_M_1 // jump if edges required +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1: keep SizeJ % 256 only when wg1 >= nwg1-1, otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B0_E1_N_1 // jump if edges required +label_GW_B0_E0_2: + +/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=26 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* 
(d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_lshl_u32 v15, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+27], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+28], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+29], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+30], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+31], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+32], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+33], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+34], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+35], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+36], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+37], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+38], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+39], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+40], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+41], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+42], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+43], acc76 // copy 
acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+44], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+45], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+46], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+47], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+48], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+49], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+50], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+51], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+52], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+53], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+54], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+55], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+56], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+57], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+58], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+59], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+60], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+61], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+62], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+63], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+64], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+65], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+66], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+67], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+68], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+69], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+70], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+71], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+72], acc192 // copy acc to vreg[48] 
+v_accvgpr_read_b32 v[vgprValuC+73], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+74], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+75], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+76], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+77], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+78], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+79], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+80], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+81], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+82], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+83], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+84], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+85], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+86], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+87], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+88], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+89], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+90], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+91], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+92], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+93], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+94], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+95], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+96], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+97], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+98], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+99], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+100], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+101], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+102], 
acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+103], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+104], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+105], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+106], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+107], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+108], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+109], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+110], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+111], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+112], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+113], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+114], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+115], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+116], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+117], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+118], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+119], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+120], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+121], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+122], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+123], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+124], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+125], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+126], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+127], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+136], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+137], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+138], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+139], 
acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+140], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+141], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+142], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+143], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+144], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+145], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+146], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+147], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+148], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+149], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+150], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+151], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+152], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+153], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+154], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+155], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+156], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+157], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+158], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+159], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+160], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+161], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+162], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+163], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+164], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+165], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+166], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+167], acc30 // copy acc to vreg[135] 
+v_accvgpr_read_b32 v[vgprValuC+168], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+169], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+170], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+171], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+172], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+173], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+174], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+175], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+176], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+177], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+178], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+179], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+180], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+181], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+182], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+183], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+184], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+185], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+186], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+187], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+188], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+189], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+190], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+191], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+192], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+193], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+194], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+195], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+196], acc146 // copy acc 
to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+197], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+198], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+199], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+200], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+201], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+202], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+203], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+204], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+205], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+206], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+207], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+208], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+209], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+210], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+211], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+212], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+213], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+214], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+215], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+216], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+217], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+218], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+219], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+220], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+221], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+222], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+223], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+224], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 
v[vgprValuC+225], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+226], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+227], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+228], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+229], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+230], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+231], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+232], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+233], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+234], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+235], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+236], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+237], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+238], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+239], acc63 // copy acc to vreg[207] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+216:vgprValuC+216+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+216:vgprValuC+216+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+218:vgprValuC+218+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+218:vgprValuC+218+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+220:vgprValuC+220+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+220:vgprValuC+220+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+222:vgprValuC+222+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+222:vgprValuC+222+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+224:vgprValuC+224+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+224:vgprValuC+224+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+226:vgprValuC+226+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+226:vgprValuC+226+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+228:vgprValuC+228+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+228:vgprValuC+228+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+230:vgprValuC+230+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+230:vgprValuC+230+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+232:vgprValuC+232+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+232:vgprValuC+232+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+234:vgprValuC+234+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+234:vgprValuC+234+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+236:vgprValuC+236+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+236:vgprValuC+236+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+238:vgprValuC+238+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+238:vgprValuC+238+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, 
s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, 
v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, 
v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += 
inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to 
bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v203, 
v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v216, v[vgprValuC+216], v[vgprValuC+217] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v217, v[vgprValuC+218], v[vgprValuC+219] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v218, v[vgprValuC+220], v[vgprValuC+221] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v219, v[vgprValuC+222], v[vgprValuC+223] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v224, v[vgprValuC+224], v[vgprValuC+225] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v225, v[vgprValuC+226], v[vgprValuC+227] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v226, v[vgprValuC+228], v[vgprValuC+229] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v227, v[vgprValuC+230], v[vgprValuC+231] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v232, v[vgprValuC+232], v[vgprValuC+233] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v233, v[vgprValuC+234], v[vgprValuC+235] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v234, v[vgprValuC+236], v[vgprValuC+237] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v235, v[vgprValuC+238], v[vgprValuC+239] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* 
(d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+24], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+25], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+26], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+27], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+28], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+29], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+30], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+31], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+32], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+33], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+34], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+35], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+36], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+37], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+38], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+39], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+40], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+41], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+42], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+43], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+44], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+45], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+46], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+47], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+48], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+49], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+50], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+51], acc175 // 
copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+52], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+53], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+54], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+55], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+56], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+57], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+58], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+59], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+60], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+61], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+62], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+63], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+64], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+65], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+66], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+67], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+68], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+69], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+70], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+71], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack 
with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store 
D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B0_E1_N_1: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 
v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy 
acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 
v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 
v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] 
+v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy 
acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 
0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] 
// *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], 
v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], 
v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[72:75], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], 
v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[80:83], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[88:91], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[96:99], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[104:107], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, 
v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[112:115], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[120:123], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[136:139], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[144:147], v225, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[152:155], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[160:163], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[168:171], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 
v[176:179], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[184:187], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[192:195], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[200:203], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 
and Pack with neighbor +buffer_store_dwordx4 v[208:211], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v81, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v81, v10, v81, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v83, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v83, v10, v83, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v84, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v85, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v85, v10, v85, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v86, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+16], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+17], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+18], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+19], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+20], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+21], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+22], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+23], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+24], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+25], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+26], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+27], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+28], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+29], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+30], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+31], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+32], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+33], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+34], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+35], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+36], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+37], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+38], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+39], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+40], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+41], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+42], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+43], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+44], acc115 // copy acc to vreg[220] 
+v_accvgpr_read_b32 v[vgprValuC+45], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+46], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+47], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+48], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+49], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+50], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+51], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+52], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+53], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+54], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+55], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+56], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+57], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+58], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+59], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+60], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+61], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+62], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+63], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+64], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+65], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+66], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+67], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+68], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+69], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+70], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+71], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+72], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+73], acc231 // copy acc to 
vreg[249] +v_accvgpr_read_b32 v[vgprValuC+74], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+75], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+76], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+77], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+78], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+79], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], 
v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] 
// convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[72:75], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B0_E1_M_1: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=114 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); 
(0,0,14,0:vw1); (0,0,14,1:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+16], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+17], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+18], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+19], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+20], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+21], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+22], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+23], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+24], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+25], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+26], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+27], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+28], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+29], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+30], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+31], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+32], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+33], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+34], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+35], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+36], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+37], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+38], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+39], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+40], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+41], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+42], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+43], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+44], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+45], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+46], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+47], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+48], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+49], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+50], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+51], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+52], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+53], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+54], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+55], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+56], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+57], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+58], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+59], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+60], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+61], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+62], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+63], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+64], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+65], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+66], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+67], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+68], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+69], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+70], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+71], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+72], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+73], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+74], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+75], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+76], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+77], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+78], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+79], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+80], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+81], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+82], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+83], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+84], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+85], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+86], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+87], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+88], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+89], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+90], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+91], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+92], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+93], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+94], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+95], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+96], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+97], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+98], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+99], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+100], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+101], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+102], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+103], acc97 // copy 
acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+104], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+105], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+106], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+107], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+108], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+109], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+110], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+111], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+112], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+113], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+114], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+115], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+116], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+117], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+118], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+119], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+120], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+121], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+122], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+123], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+124], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+125], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+126], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+127], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+128], acc197 // copy acc to vreg[113] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), 
(0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+128], s[sgprAlpha], v[vgprValuC+128] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to 
bf16 in gwvw==1 +buffer_store_short v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v156, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], 
v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, 
v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to 
bf16 in gwvw==1 +buffer_store_short v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v91, v[vgprValuC+91], v[vgprValuC+91] // convert C to bf16 in gwvw==1 +buffer_store_short v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v92, v[vgprValuC+92], v[vgprValuC+92] // convert C to bf16 in gwvw==1 +buffer_store_short v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v93, v[vgprValuC+93], v[vgprValuC+93] // convert C to bf16 in gwvw==1 +buffer_store_short v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v94, v[vgprValuC+94], v[vgprValuC+94] // convert C to bf16 in gwvw==1 +buffer_store_short v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v95, v[vgprValuC+95], v[vgprValuC+95] // convert C to bf16 in gwvw==1 +buffer_store_short v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+96] // convert C to bf16 in gwvw==1 +buffer_store_short v96, v213, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v97, v[vgprValuC+97], v[vgprValuC+97] // convert C to bf16 in gwvw==1 +buffer_store_short v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v98, v[vgprValuC+98], v[vgprValuC+98] // convert C to bf16 in gwvw==1 +buffer_store_short v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v99, v[vgprValuC+99], v[vgprValuC+99] // convert C to bf16 in gwvw==1 +buffer_store_short v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v100, v[vgprValuC+100], v[vgprValuC+100] // convert C to bf16 in gwvw==1 +buffer_store_short v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v101, v[vgprValuC+101], v[vgprValuC+101] // convert C to bf16 in gwvw==1 +buffer_store_short v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v102, v[vgprValuC+102], v[vgprValuC+102] // convert C to bf16 in gwvw==1 +buffer_store_short v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v103, v[vgprValuC+103], v[vgprValuC+103] // convert C to bf16 in gwvw==1 +buffer_store_short v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+104] // convert C to bf16 in gwvw==1 +buffer_store_short v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v105, v[vgprValuC+105], v[vgprValuC+105] // convert C to bf16 in gwvw==1 +buffer_store_short v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v106, v[vgprValuC+106], v[vgprValuC+106] // convert C to bf16 in gwvw==1 +buffer_store_short v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v107, v[vgprValuC+107], v[vgprValuC+107] // convert C to bf16 in gwvw==1 +buffer_store_short v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v108, v[vgprValuC+108], v[vgprValuC+108] // convert C to bf16 in gwvw==1 +buffer_store_short v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v109, v[vgprValuC+109], v[vgprValuC+109] // convert C to bf16 in gwvw==1 +buffer_store_short v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v110, v[vgprValuC+110], v[vgprValuC+110] // convert C to bf16 in gwvw==1 +buffer_store_short v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v111, v[vgprValuC+111], v[vgprValuC+111] // convert C to bf16 in gwvw==1 +buffer_store_short v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+112] // convert C to bf16 in gwvw==1 +buffer_store_short v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v113, v[vgprValuC+113], v[vgprValuC+113] // convert C to bf16 in gwvw==1 +buffer_store_short v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v114, v[vgprValuC+114], v[vgprValuC+114] // convert C to bf16 in gwvw==1 +buffer_store_short v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v115, v[vgprValuC+115], v[vgprValuC+115] // convert C to bf16 in gwvw==1 +buffer_store_short v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v116, v[vgprValuC+116], v[vgprValuC+116] // convert C to bf16 in gwvw==1 +buffer_store_short v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v117, v[vgprValuC+117], v[vgprValuC+117] // convert C to bf16 in gwvw==1 +buffer_store_short v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v118, v[vgprValuC+118], v[vgprValuC+118] // convert C to bf16 in gwvw==1 +buffer_store_short v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v119, 
v[vgprValuC+119], v[vgprValuC+119] // convert C to bf16 in gwvw==1 +buffer_store_short v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+120] // convert C to bf16 in gwvw==1 +buffer_store_short v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v121, v[vgprValuC+121], v[vgprValuC+121] // convert C to bf16 in gwvw==1 +buffer_store_short v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v122, v[vgprValuC+122], v[vgprValuC+122] // convert C to bf16 in gwvw==1 +buffer_store_short v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v123, v[vgprValuC+123], v[vgprValuC+123] // convert C to bf16 in gwvw==1 +buffer_store_short v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v124, v[vgprValuC+124], v[vgprValuC+124] // convert C to bf16 in gwvw==1 +buffer_store_short v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v125, v[vgprValuC+125], v[vgprValuC+125] // convert C to bf16 in gwvw==1 +buffer_store_short v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v126, v[vgprValuC+126], v[vgprValuC+126] // convert C to bf16 in gwvw==1 +buffer_store_short v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v127, v[vgprValuC+127], v[vgprValuC+127] // convert C to bf16 in gwvw==1 +buffer_store_short v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v128, v[vgprValuC+128], v[vgprValuC+128] // convert C to bf16 in gwvw==1 +buffer_store_short v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + 
+/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */ +/******************************************/ + +/* calc coords, apply 
mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+16], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+17], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+18], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+19], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+20], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+21], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+22], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+23], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+24], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+25], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+26], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+27], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+28], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+29], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+30], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 
v[vgprValuC+31], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+32], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+33], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+34], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+35], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+36], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+37], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+38], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+39], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+40], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+41], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+42], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+43], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+44], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+45], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+46], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+47], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+48], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+49], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+50], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+51], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+52], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+53], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+54], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+55], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+56], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+57], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+58], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+59], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+60], 
acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+61], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+62], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+63], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+64], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+65], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+66], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+67], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+68], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+69], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+70], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+71], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+72], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+73], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+74], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+75], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+76], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+77], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+78], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+79], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+80], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+81], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+82], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+83], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+84], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+85], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+86], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+87], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+88], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 
v[vgprValuC+89], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+90], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+91], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+92], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+93], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+94], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+95], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+96], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+97], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+98], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+99], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+100], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+101], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+102], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+103], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+104], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+105], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+106], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+107], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+108], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+109], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+110], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+111], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+112], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+113], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+114], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+115], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+116], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+117], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 
v[vgprValuC+118], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+119], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+120], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+121], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+122], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+123], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+124], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+125], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+126], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+127], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+128], acc143 // copy acc to vreg[227] + +/* rC *= alpha batchElements=[(0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), 
(0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+128], s[sgprAlpha], v[vgprValuC+128] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to 
bf16 in gwvw==1 +buffer_store_short v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v150, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], 
v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, 
v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to 
bf16 in gwvw==1 +buffer_store_short v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v207, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v91, v[vgprValuC+91], v[vgprValuC+91] // convert C to bf16 in gwvw==1 +buffer_store_short v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v92, v[vgprValuC+92], v[vgprValuC+92] // convert C to bf16 in gwvw==1 +buffer_store_short v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v93, v[vgprValuC+93], v[vgprValuC+93] // convert C to bf16 in gwvw==1 +buffer_store_short v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v94, v[vgprValuC+94], v[vgprValuC+94] // convert C to bf16 in gwvw==1 +buffer_store_short v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v95, v[vgprValuC+95], v[vgprValuC+95] // convert C to bf16 in gwvw==1 +buffer_store_short v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+96] // convert C to bf16 in gwvw==1 +buffer_store_short v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v97, v[vgprValuC+97], v[vgprValuC+97] // convert C to bf16 in gwvw==1 +buffer_store_short v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v98, v[vgprValuC+98], v[vgprValuC+98] // convert C to bf16 in gwvw==1 +buffer_store_short v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v99, v[vgprValuC+99], v[vgprValuC+99] // convert C to bf16 in gwvw==1 +buffer_store_short v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v100, v[vgprValuC+100], v[vgprValuC+100] // convert C to bf16 in gwvw==1 +buffer_store_short v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v101, v[vgprValuC+101], v[vgprValuC+101] // convert C to bf16 in gwvw==1 +buffer_store_short v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v102, 
v[vgprValuC+102], v[vgprValuC+102] // convert C to bf16 in gwvw==1 +buffer_store_short v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v103, v[vgprValuC+103], v[vgprValuC+103] // convert C to bf16 in gwvw==1 +buffer_store_short v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+104] // convert C to bf16 in gwvw==1 +buffer_store_short v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v105, v[vgprValuC+105], v[vgprValuC+105] // convert C to bf16 in gwvw==1 +buffer_store_short v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v106, v[vgprValuC+106], v[vgprValuC+106] // convert C to bf16 in gwvw==1 +buffer_store_short v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v107, v[vgprValuC+107], v[vgprValuC+107] // convert C to bf16 in gwvw==1 +buffer_store_short v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v108, v[vgprValuC+108], v[vgprValuC+108] // convert C to bf16 in gwvw==1 +buffer_store_short v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v109, v[vgprValuC+109], v[vgprValuC+109] // convert C to bf16 in gwvw==1 +buffer_store_short v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v110, v[vgprValuC+110], v[vgprValuC+110] // convert C to bf16 in gwvw==1 +buffer_store_short v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v111, v[vgprValuC+111], v[vgprValuC+111] // convert C to bf16 in gwvw==1 +buffer_store_short v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+112] // convert C to bf16 in gwvw==1 +buffer_store_short v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v113, v[vgprValuC+113], 
v[vgprValuC+113] // convert C to bf16 in gwvw==1 +buffer_store_short v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v114, v[vgprValuC+114], v[vgprValuC+114] // convert C to bf16 in gwvw==1 +buffer_store_short v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v115, v[vgprValuC+115], v[vgprValuC+115] // convert C to bf16 in gwvw==1 +buffer_store_short v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v116, v[vgprValuC+116], v[vgprValuC+116] // convert C to bf16 in gwvw==1 +buffer_store_short v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v117, v[vgprValuC+117], v[vgprValuC+117] // convert C to bf16 in gwvw==1 +buffer_store_short v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v118, v[vgprValuC+118], v[vgprValuC+118] // convert C to bf16 in gwvw==1 +buffer_store_short v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v119, v[vgprValuC+119], v[vgprValuC+119] // convert C to bf16 in gwvw==1 +buffer_store_short v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+120] // convert C to bf16 in gwvw==1 +buffer_store_short v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v121, v[vgprValuC+121], v[vgprValuC+121] // convert C to bf16 in gwvw==1 +buffer_store_short v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v122, v[vgprValuC+122], v[vgprValuC+122] // convert C to bf16 in gwvw==1 +buffer_store_short v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v123, v[vgprValuC+123], v[vgprValuC+123] // convert C to bf16 in gwvw==1 +buffer_store_short v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v124, v[vgprValuC+124], v[vgprValuC+124] // convert C 
to bf16 in gwvw==1 +buffer_store_short v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v125, v[vgprValuC+125], v[vgprValuC+125] // convert C to bf16 in gwvw==1 +buffer_store_short v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v126, v[vgprValuC+126], v[vgprValuC+126] // convert C to bf16 in gwvw==1 +buffer_store_short v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v127, v[vgprValuC+127], v[vgprValuC+127] // convert C to bf16 in gwvw==1 +buffer_store_short v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v128, v[vgprValuC+128], v[vgprValuC+128] // convert C to bf16 in gwvw==1 +buffer_store_short v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 
+v_add_lshl_u32 v43, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v43, v10, v43, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v45, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v45, v10, v45, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v47, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v47, v10, v47, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v49, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v49, v10, v49, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v51, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v51, v10, v51, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v53, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v53, v10, v53, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v55, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v55, v10, v55, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v57, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v57, v10, v57, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v59, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v59, v10, v59, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v61, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v61, v10, v61, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v63, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v63, v10, v63, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v65, v10, v65, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v67, v10, v67, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v69, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v69, v10, v69, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+16], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+17], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+18], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+19], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+20], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+21], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+22], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+23], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+24], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+25], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+26], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+27], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+28], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+29], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+30], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+31], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+32], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+33], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+34], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+35], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+36], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+37], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+38], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+39], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+40], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+41], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+42], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 
5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+42], s[sgprAlpha], v[vgprValuC+42] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // 
convert C to bf16 in gwvw==1 +buffer_store_short v23, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v62, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v69, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_Beta_2: // dispatcher: picks edge code (label_GW_B1_E1_M / label_GW_B1_E1_N) or falls through to label_GW_B1_E0; presumably the beta != 0 write path - confirm against the branch that targets this label +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] // s31 = s[sgprNumWorkGroups0] - 1 +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? 
+s_cselect_b32 s30, s30, 0 // set rMT0: keep SizeI % 256 when SCC is set (wg0 >= nwg0-1), otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required (rMT0 is non-zero) +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] // s31 = s[sgprNumWorkGroups1] - 1 +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1: keep SizeJ % 256 when SCC is set (wg1 >= nwg1-1), otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required (rMT1 is non-zero) +label_GW_B1_E0: // reached by fall-through only when both rMT0 and rMT1 are 0 + +/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=18 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_add_lshl_u32 v16, v6, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +buffer_load_dwordx4 v[20:23], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[128:131], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[176:179], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[180:183], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[184:187], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[188:191], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[192:195], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[196:199], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[200:203], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[204:207], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[208:211], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[212:215], v16, 
s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[216:219], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[220:223], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[224:227], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[228:231], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[232:235], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: 
gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[236:239], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v15, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+27], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+28], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+29], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+30], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+31], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+32], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+33], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+34], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+35], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+36], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+37], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+38], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+39], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+40], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+41], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+42], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+43], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+44], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+45], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+46], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+47], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+48], acc96 // copy acc to 
vreg[24] +v_accvgpr_read_b32 v[vgprValuC+49], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+50], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+51], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+52], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+53], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+54], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+55], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+56], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+57], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+58], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+59], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+60], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+61], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+62], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+63], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+64], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+65], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+66], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+67], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+68], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+69], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+70], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+71], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+72], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+73], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+74], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+75], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+76], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+77], acc212 // copy acc to vreg[53] 
+v_accvgpr_read_b32 v[vgprValuC+78], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+79], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+80], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+81], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+82], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+83], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+84], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+85], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+86], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+87], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+88], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+89], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+90], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+91], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+92], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+93], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+94], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+95], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+96], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+97], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+98], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+99], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+100], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+101], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+102], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+103], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+104], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+105], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+106], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+107], 
acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+108], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+109], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+110], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+111], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+112], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+113], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+114], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+115], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+116], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+117], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+118], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+119], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+120], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+121], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+122], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+123], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+124], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+125], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+126], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+127], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+136], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+137], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+138], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+139], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+140], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+141], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+142], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+143], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 
v[vgprValuC+144], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+145], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+146], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+147], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+148], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+149], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+150], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+151], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+152], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+153], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+154], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+155], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+156], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+157], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+158], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+159], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+160], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+161], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+162], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+163], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+164], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+165], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+166], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+167], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+168], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+169], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+170], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+171], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+172], acc50 // copy acc to vreg[140] 
+v_accvgpr_read_b32 v[vgprValuC+173], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+174], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+175], acc62 // copy acc to vreg[143] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 + +s_waitcnt vmcnt(17) // vmcnt(17) = 18 - 1 (beta) (interleaved) +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(16) = 18 - 2 (beta) (interleaved) +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(15) = 18 - 3 (beta) (interleaved) +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with 
neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(14) = 18 - 4 (beta) (interleaved) +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(13) = 18 - 5 (beta) (interleaved) +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], 
s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(12) = 18 - 6 (beta) (interleaved) +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(11) = 18 - 7 (beta) (interleaved) +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(10) = 18 - 8 
(beta) (interleaved) +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(9) = 18 - 9 (beta) (interleaved) +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 
v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v201 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v201 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v203 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v203 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(8) = 18 - 10 (beta) (interleaved) +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v204 
src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(7) = 18 - 11 (beta) (interleaved) +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(6) = 18 - 12 (beta) (interleaved) +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_0 // cvt 
bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(5) = 18 - 13 (beta) (interleaved) +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(4) = 18 - 14 (beta) (interleaved) +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v221 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v221 src0_sel:WORD_1 // cvt bf16 to 
f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v223 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v223 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(3) = 18 - 15 (beta) (interleaved) +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+144], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+145], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+146], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+147], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+148], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+149], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+150], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+151], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(2) = 18 - 16 (beta) (interleaved) +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+152], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+153], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v229 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+154], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v229 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+155], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+156], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+157], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v231 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+158], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v231 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+159], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(1) = 18 - 17 (beta) (interleaved) +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+160], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+161], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v233 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+162], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v233 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+163], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+164], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+165], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v235 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+166], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v235 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+167], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(0) = 18 - 18 (beta) (interleaved) +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+168], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+169], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v237 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+170], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v237 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+171], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+172], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+173], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v239 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+174], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v239 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+175], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD 
+= inc(upper) +buffer_load_dwordx4 v[20:23], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[128:131], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[144:147], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[148:151], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[152:155], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[156:159], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 
s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[160:163], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[164:167], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[168:171], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[172:175], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[176:179], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[180:183], v16, 
s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[184:187], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[188:191], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+25], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+26], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+27], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+28], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+29], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+30], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+31], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+32], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+33], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+34], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+35], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+36], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+37], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+38], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+39], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+40], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+41], acc134 // copy acc to vreg[161] 
+v_accvgpr_read_b32 v[vgprValuC+42], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+43], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+44], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+45], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+46], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+47], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+48], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+49], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+50], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+51], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+52], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+53], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+54], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+55], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+56], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+57], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+58], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+59], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+60], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+61], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+62], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+63], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+64], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+65], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+66], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+67], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+68], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+69], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+70], acc250 // copy acc to 
vreg[190] +v_accvgpr_read_b32 v[vgprValuC+71], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+72], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+73], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+74], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+75], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+76], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+77], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+78], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+79], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+80], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+81], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+82], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+83], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+84], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+85], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+86], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+87], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+88], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+89], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+90], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+91], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+92], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+93], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+94], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+95], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+96], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+97], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+98], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+99], acc111 // copy acc to vreg[219] 
+v_accvgpr_read_b32 v[vgprValuC+100], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+101], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+102], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+103], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+104], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+105], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+106], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+107], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+108], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+109], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+110], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+111], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+112], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+113], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+114], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+115], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+116], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+117], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+118], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+119], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+120], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+121], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+122], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+123], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+124], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+125], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+126], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+127], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+136], 
acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+137], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+138], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+139], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+140], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+141], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+142], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+143], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= 
alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 + +s_waitcnt vmcnt(13) // vmcnt(13) = 14 - 1 (beta) (interleaved) +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 
v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(12) = 14 - 2 (beta) (interleaved) +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(11) = 14 - 3 (beta) (interleaved) +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v145 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v147 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v147 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with 
neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(10) = 14 - 4 (beta) (interleaved) +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v149 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v149 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v151 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack 
with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(9) = 14 - 5 (beta) (interleaved) +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack 
with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(8) = 14 - 6 (beta) (interleaved) +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(7) = 14 - 7 (beta) (interleaved) +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v161 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v161 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v163 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], 
s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(6) = 14 - 8 (beta) (interleaved) +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(5) = 14 - 9 (beta) (interleaved) +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(4) = 14 - 10 
(beta) (interleaved) +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(3) = 14 - 11 (beta) (interleaved) +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(2) = 14 - 12 (beta) (interleaved) +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(1) = 14 - 13 (beta) (interleaved) +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(0) = 14 - 14 (beta) (interleaved) +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B1_E1_N: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=16 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[128:131], v15, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[152:155], v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[156:159], v160, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[164:167], v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[168:171], v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[172:175], v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[176:179], v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v180, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[184:187], v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[188:191], v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v182, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[192:195], v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[196:199], v200, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v200, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[204:207], v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[208:211], v202, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[212:215], v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[216:219], v220, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[224:227], v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] 
+v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 
v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] 
+v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc 
to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor 
+buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_f32_bf16 v8, v166 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to 
bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[72:75], v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[80:83], v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, 
s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[88:91], v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], 
v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[96:99], v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[104:107], v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 
v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[112:115], v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[120:123], v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[136:139], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+144], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+145], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+146], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+147], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+148], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+149], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+150], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+151], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[144:147], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[128:131], v15, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[152:155], v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[156:159], v160, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[164:167], v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[168:171], v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[172:175], v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[176:179], v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v180, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[184:187], v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[188:191], v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v182, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[192:195], v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[196:199], v200, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v200, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[204:207], v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[208:211], v202, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[212:215], v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[216:219], v220, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[224:227], v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+17], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+18], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+19], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+20], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+21], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+22], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+23], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+24], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+25], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+26], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+27], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+28], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+29], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+30], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+31], acc62 // copy acc to vreg[143] 
+v_accvgpr_read_b32 v[vgprValuC+32], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+33], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+34], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+35], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+36], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+37], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+38], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+39], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+40], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+41], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+42], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+43], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+44], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+45], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+46], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+47], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+48], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+49], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+50], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+51], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+52], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+53], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+54], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+55], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+56], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+57], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+58], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+59], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+60], acc178 // copy acc to vreg[172] 
+v_accvgpr_read_b32 v[vgprValuC+61], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+62], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+63], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+64], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+65], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+66], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+67], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+68], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+69], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+70], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+71], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+72], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+73], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+74], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+75], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+76], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+77], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+78], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+79], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+80], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+81], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+82], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+83], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+84], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+85], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+86], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+87], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+88], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+89], acc39 // copy acc to vreg[201] 
+v_accvgpr_read_b32 v[vgprValuC+90], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+91], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+92], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+93], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+94], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+95], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+96], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+97], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+98], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+99], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+100], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+101], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+102], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+103], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+104], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+105], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+106], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+107], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+108], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+109], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+110], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+111], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+112], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+113], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+114], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+115], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+116], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+117], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+118], acc155 // copy acc to 
vreg[230] +v_accvgpr_read_b32 v[vgprValuC+119], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+120], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+121], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+122], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+123], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+124], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+125], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+126], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+127], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+136], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+137], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+138], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+139], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+140], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+141], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+142], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+143], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+144], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+145], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+146], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+147], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+148], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+149], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+150], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+151], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 
0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha 
(pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], 
v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], 
v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_1 // 
cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, 
v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor 
+buffer_store_dwordx4 v[72:75], v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[80:83], v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[88:91], v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[96:99], v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[104:107], v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, 
v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[112:115], v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[120:123], v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[136:139], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+144], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+145], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+146], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+147], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, 
v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+148], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+149], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+150], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+151], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[144:147], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B1_E1_M: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=76 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+16], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+17], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+18], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+19], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+20], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+21], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+22], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+23], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+24], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+25], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+26], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+27], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+28], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+29], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+30], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+31], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+32], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+33], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+34], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+35], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+36], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+37], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+38], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+39], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+40], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+41], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+42], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+43], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+44], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+45], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+46], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+47], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+48], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+49], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+50], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+51], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+52], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+53], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+54], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+55], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+56], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+57], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+58], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+59], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+60], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+61], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+62], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+63], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+64], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+65], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+66], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+67], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+68], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+69], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+70], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+71], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+72], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+73], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+74], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+75], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+76], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+77], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+78], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+79], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+80], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+81], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+82], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+83], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+84], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+85], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+86], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+87], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+88], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+89], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+90], acc45 // copy acc to vreg[75] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), 
(0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 
+buffer_store_short v15, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v99 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v101 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v105 src0_sel:WORD_0 // 
cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v107 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v111 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v113 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v117 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, 
v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v119 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v123 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v125 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v130, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v138 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v140 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, 
s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to 
bf16 in gwvw==1 +buffer_store_short v47, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, 
v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v197, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], 
v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v240 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v242 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v244 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); 
(0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+16], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+17], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+18], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+19], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+20], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+21], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+22], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+23], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+24], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+25], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+26], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+27], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+28], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+29], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+30], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+31], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+32], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+33], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 
v[vgprValuC+34], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+35], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+36], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+37], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+38], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+39], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+40], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+41], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+42], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+43], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+44], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+45], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+46], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+47], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+48], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+49], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+50], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+51], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+52], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+53], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+54], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+55], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+56], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+57], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+58], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+59], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+60], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+61], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+62], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 
v[vgprValuC+63], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+64], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+65], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+66], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+67], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+68], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+69], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+70], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+71], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+72], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+73], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+74], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+75], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+76], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+77], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+78], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+79], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+80], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+81], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+82], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+83], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+84], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+85], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+86], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+87], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+88], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+89], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+90], acc94 // copy acc to vreg[151] + +/* rC *= alpha batchElements=[(0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 
10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v99 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 
v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v101 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v105 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v107 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v111 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v112, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v113 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v117 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v119 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v123 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v125 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v138 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v140 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], 
v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 
+buffer_store_short v57, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v217, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v240 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v242 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], 
v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v244 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */ 
+/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+16], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+17], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+18], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+19], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+20], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+21], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+22], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+23], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+24], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+25], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+26], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+27], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+28], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+29], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+30], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+31], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+32], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+33], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+34], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+35], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+36], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+37], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+38], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+39], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 
v[vgprValuC+40], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+41], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+42], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+43], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+44], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+45], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+46], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+47], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+48], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+49], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+50], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+51], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+52], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+53], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+54], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+55], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+56], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+57], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+58], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+59], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+60], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+61], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+62], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+63], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+64], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+65], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+66], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+67], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+68], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 
v[vgprValuC+69], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+70], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+71], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+72], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+73], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+74], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+75], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+76], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+77], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+78], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+79], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+80], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+81], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+82], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+83], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+84], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+85], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+86], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+87], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+88], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+89], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+90], acc143 // copy acc to vreg[227] + +/* rC *= alpha batchElements=[(0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), 
(0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and 
issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v99 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v101 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v102, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v105 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v107 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v111 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v113 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v117 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v119 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v123 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v125 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], 
v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v138 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v140 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 
+buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v182 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v207, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], 
v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v240 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v242 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v244 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */ +/* (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v43, v44, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v44, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v45, v46, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v46, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v47, v48, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v49, v50, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v50, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v51, v52, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v52, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v53, v54, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v54, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v55, v56, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v56, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v57, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v59, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v61, v62, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v62, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v63, v64, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v64, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v65, v66, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v66, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v67, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v68, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v69, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v72, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v72, v10, v72, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v71, v72, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v72, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v72, v10, v72, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v74, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v74, v10, v74, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v73, v74, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v74, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v74, v10, v74, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v76, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v76, v10, v76, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v75, v76, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v76, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v76, v10, v76, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v78, v10, v78, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v77, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v78, v10, v78, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v79, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v80, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v81, v82, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v82, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v84, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v83, v84, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v84, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v86, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v85, v86, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v86, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v88, v10, v88, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v87, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v88, v10, v88, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v89, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+16], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+17], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+18], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+19], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+20], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+21], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+22], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+23], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+24], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+25], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+26], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+27], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+28], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+29], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+30], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+31], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+32], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+33], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+34], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+35], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+36], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+37], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+38], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+39], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+40], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+41], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+42], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 
5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+42], s[sgprAlpha], v[vgprValuC+42] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v43 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v45 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v47 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v49 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v51 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v19, 
v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v53 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v55 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v59 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v61 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v63 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v64, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_cvt_f32_bf16 v8, v65 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v69 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v71 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v73 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v75 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v79 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v81 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v83 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v85 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v89 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 
+buffer_store_short v38, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_End_2: +label_KernelEnd: +s_endpgm // Kernel End +label_ASM_End: /// The end of the kernel diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950.s new file mode 100644 index 00000000000..fcb19ec994f --- /dev/null +++ 
b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950.s @@ -0,0 +1,19343 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx950" +.text +.protected Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 +.globl Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 +.p2align 8 +.type Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_accum_offset 248 // accvgpr offset + .amdhsa_next_free_vgpr 504 // vgprs + .amdhsa_next_free_sgpr 88 // sgprs + .amdhsa_group_segment_fixed_size 133120 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_user_sgpr_count 13 + .amdhsa_user_sgpr_kernarg_preload_length 11 + .amdhsa_user_sgpr_kernarg_preload_offset 0 +.end_amdhsa_kernel +.text +/* Num VGPR =248 */ +/* Num AccVGPR=256 */ +/* Num SGPR =88 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 32 x 8 */ +/* SubGroup= 8 x 32 */ +/* VectorWidthA=8 */ +/* VectorWidthB=8 */ +/* GlobalReadVectorWidthA=8, GlobalReadVectorWidthB=8 */ +/* DirectToLdsA=True */ +/* DirectToLdsB=True */ +/* UseSgprForGRO=1 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 + ProblemType: + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + 
TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: False +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950 + .symbol: 'Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: 
strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + .group_segment_fixed_size: 133120 + .kernarg_segment_align: 8 + .kernarg_segment_size: 104 + .max_flat_workgroup_size: 256 + .private_segment_fixed_size: 0 + .sgpr_count: 88 + .sgpr_spill_count: 0 + .vgpr_count: 248 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950: +label_ASM_Start: /// Main body of the asm kernel +.macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req + v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber // dst+1 = high 32 bits of dividend * magicNumber + v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA // dst+0 = low 32 bits of dividend * magicA + v_add_u32 v[\vgprDstIdx+0], v[\vgprDstIdx+0], v[\vgprDstIdx+1] // dst+0 += dst+1 + v_lshrrev_b32 v[\vgprDstIdx+0], \magicShift, v[\vgprDstIdx+0] // dst+0 >>= magicShift; result is left in dst+0, dst+1 is clobbered as scratch +.endm + +/******************************************/ +/* VGPR Assignments */ +/******************************************/ +/* ValuC range: [0-0), serializedStore enabled */ +.set vgprValuC, 0 +/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ +.set vgprBase, 4 +.set vgprGlobalReadOffsetA, 0 +.set vgprGlobalReadOffsetB, 1 +.set vgprLocalReadAddrA, 2 +.set vgprLocalReadAddrB, 3 +.set vgprLocalReadSwapAddrA, 132 +.set vgprLocalReadSwapAddrB, 133 +.set vgprSerial, 134 + +/******************************************/ +/* VGPR Macro Assignments */ +/******************************************/ +.set vgprValuA_X0_I0_BASE, vgprBase+0 +.set vgprValuB_X0_I0_BASE, vgprBase+64 +.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0 +.set vgprValuA_X1_I0, vgprValuA_X0_I0_BASE+32 +.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0 +.set 
vgprValuB_X1_I0, vgprValuB_X0_I0_BASE+32 + +/******************************************/ +/* SGPR Assignments */ +/******************************************/ +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprWGM, 11 +.set sgprLoopCounterL, 12 +.set sgprOrigLoopCounter, 13 +.set sgprSrdD, 16 +.set sgprSrdC, 20 +.set sgprNumWorkGroups0, 14 +.set sgprNumWorkGroups1, 15 +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprLocalWriteAddrA, 46 +.set sgprLocalWriteAddrB, 47 +.set sgprSwapA, 48 +.set sgprSwapB, 49 +.set sgprGSU, 50 + +/* Size Assignments */ +.set sgprSizeI, sgprSizesFree+0 +.set sgprSizeJ, sgprSizesFree+1 +.set sgprSizeK, sgprSizesFree+2 +.set sgprSizeL, sgprSizesSum+0 + +/* Stride Assignments */ +.set constStrideD0I, 1 +.set sgprStrideD1J, sgprStridesD+0 +.set sgprStrideDK, sgprStridesD+1 +.set constStrideC0I, 1 +.set sgprStrideC1J, sgprStridesC+0 +.set sgprStrideCK, sgprStridesC+1 +.set constStrideAL, 1 +.set sgprStrideA0I, sgprStridesA+0 +.set sgprStrideAK, sgprStridesA+1 +.set constStrideBL, 1 +.set sgprStrideB1J, sgprStridesB+0 +.set sgprStrideBK, sgprStridesB+1 + +.set MT0, 256 +.set MT1, 256 +.set DepthU, 64 +.set BpeA, 2 +.set BpeALog2, 1 +.set BpeB, 2 +.set BpeBLog2, 1 +.set BpeAGR, 2 +.set BpeAGRLog2, 1 +.set BpeBGR, 2 +.set BpeBGRLog2, 1 +/* Number of elements to shift-left SRD */ +.set SrdShiftLeftA, 8 +.set SrdShiftLeftB, 8 +/* 2GB limit - set offsets to -1 to exceed this and clamp */ +.set BufferLimit, 0xffffffff +.set BufferOOB, 0x80000000 + +/******************************************/ +/* Bits 127:96 of SRD. 
*/ +/* hex: 0x20000 */ +/* dst_sel_x (3b): 0 */ +/* dst_sel_y (3b): 0 */ +/* dst_sel_z (3b): 0 */ +/* dst_sel_w (3b): 0 */ +/* num_format (3b): 0 */ +/* data_format (4b): 4 */ +/* user_vm_enable (1b): 0 */ +/* user_vm_mode (1b): 0 */ +/* index_stride (2b): 0 */ +/* add_tid_enable (1b): 0 */ +/* _unusedA (3b): 0 */ +/* nv (1b): 0 */ +/* _unusedB (2b): 0 */ +/* type (2b): 0 */ +/******************************************/ +.set Srd127_96, 0x20000 + +/* Global Offset A: vgprAddr = (offsetL + StrideA0I*offset0I + 8) << 1; clobbers vgprTmp and vcc */ +.macro GLOBAL_OFFSET_A vgprAddr:req, vgprOffsetL:req, vgprOffset0I:req, vgprTmp:req + v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // tmp = StrideA0I * offset0I (low 32 bits) + v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // addr = offsetL + tmp, in elements; carry-out goes to vcc + v_add_u32 v[\vgprAddr+0], 0x8, v[\vgprAddr+0] // add 8-element pre-pad for pointer shift (presumably SrdShiftLeftA = 8 -- TODO confirm) + v_lshlrev_b32 v[\vgprAddr+0], 1, v[\vgprAddr+0] // elements -> bytes: shift left by 1 (BpeA = 2) +.endm + +/* Global Offset B: vgprAddr = (offsetL + StrideB1J*offset1J + 8) << 1; clobbers vgprTmp and vcc */ +.macro GLOBAL_OFFSET_B vgprAddr:req, vgprOffsetL:req, vgprOffset1J:req, vgprTmp:req + v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // tmp = StrideB1J * offset1J (low 32 bits) + v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // addr = offsetL + tmp, in elements; carry-out goes to vcc + v_add_u32 v[\vgprAddr+0], 0x8, v[\vgprAddr+0] // add 8-element pre-pad for pointer shift (presumably SrdShiftLeftB = 8 -- TODO confirm) + v_lshlrev_b32 v[\vgprAddr+0], 1, v[\vgprAddr+0] // elements -> bytes: shift left by 1 (BpeB = 2) +.endm + +/******************************************/ +/* Allocate Resources */ +/******************************************/ + +/* Load num of Gemms */ +s_load_dword s51, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 + +/* Load packed kernel args (StaggerU/GSU) */ +s_load_dword s53, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 + +/* Load num of WGs */ +s_load_dword s54, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_waitcnt lgkmcnt(0) // wait for the four scalar arg loads above +s_lshr_b32 s52, s51, 0x1e // arg type = bits 31:30 of the first kernarg dword +s_and_b32 s51, 0x3fffffff, s51 // number of gemms = low 30 bits of the same dword +s_cmp_eq_u32 s52, 0 
// Is kernel args +s_cbranch_scc0 label_HBMArgs +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_waitcnt lgkmcnt(0) // preload +s_branch label_LoadArgsEnd +label_HBMArgs: + +/* Load address of kernel arguments */ +s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 +s_waitcnt lgkmcnt(0) // wait for args to load +label_LoadArgsEnd: +s_branch label_common_kernel_entry + +/* pad 37 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +label_Preload_Offset_Start: +s_and_b32 s51, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s52, s2, 0x1e // Get arg type +s_mov_b32 s53, s3 // Preload internal args +s_cmp_eq_u32 s52, 0 // Is kernel args +s_cbranch_scc0 label_Preload_HBMArgs +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dword s31, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 +s_load_dwordx8 s[32:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_mov_b64 s[24:25], s[6:7] // move preload data to correct 
sgpr +s_mov_b64 s[26:27], s[8:9] // move preload data to correct sgpr +s_mov_b64 s[28:29], s[10:11] // move preload data to correct sgpr +s_mov_b32 s30, s12 // move preload data to correct sgpr +s_branch label_Preload_LoadArgsEnd +label_Preload_HBMArgs: +s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments +label_Preload_LoadArgsEnd: +s_mov_b32 s[sgprWGM], s4 // Preload internal args2 +s_mov_b32 s54, s5 // Load num of WGs +label_common_kernel_entry: /// for both preload/non-preload common code +s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id +s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id +s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id +s_and_b32 s[sgprStaggerU], s53, 0xffff0000 // Restore StaggerU related vars +s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 +s_and_b32 s[sgprGSU], s53, 0xffff // Restore GSUConfig and GSU +s_mov_b32 s[sgprArgType], s52 +s_mov_b32 m0, 0x20800 // LDS clamp at 133120 bytes +v_mov_b32 v[vgprSerial], v0 // thread serial id + +/* remap workgroup to XCCs */ +s_lshr_b32 s60, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s60, s60 // Get log(WGMXCC) +s_lshr_b32 s61, s[sgprWGM], 0x16 // Get CU_Count +/* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ +s_cmp_gt_i32 s60, 0 +s_cbranch_scc0 label_skip_WGMXCC +/* only remap WGs in the range */ +s_lshr_b32 s57, s54, s60 +s_lshl_b32 s57, s57, s60 +s_cmp_ge_u32 s[sgprWorkGroup0], s57 +s_cbranch_scc1 label_skip_WGMXCC +s_cmp_eq_u32 s61, 0 // CU_Count == 0 ? 
+s_cbranch_scc0 label_XCCG_nonzero +s_lshr_b32 s57, s[sgprWorkGroup0], s60 +s_bfm_b32 s58, s60, 0 +s_and_b32 s58, s[sgprWorkGroup0], s58 +s_lshr_b32 s59, s54, s60 +s_mul_i32 s58, s58, s59 +s_add_u32 s[sgprWorkGroup0], s57, s58 +s_branch label_skip_WGMXCC +label_XCCG_nonzero: +/* temp0 = (wg//CU_Count)*CU_Count */ +v_cvt_f32_u32 v10, s61 // wg//CU_Count +v_rcp_iflag_f32 v10, v10 // wg//CU_Count +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // wg//CU_Count +v_mul_f32 v10, v10, v11 // wg//CU_Count +v_cvt_u32_f32 v10, v10 // wg//CU_Count +v_mul_u32_u24 v11, v10, s61 // wg//CU_Count +v_sub_u32 v11, s[sgprWorkGroup0], v11 // wg//CU_Count +v_cmpx_eq_u32 exec, v11, s61 // wg//CU_Count +v_add_u32 v10, 1, v10 // wg//CU_Count +v_mov_b32 v11, 0 // wg//CU_Count +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s61 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s61 // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup0], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s57, v10 // quotient +v_readfirstlane_b32 s58, v11 // remainder +s_mul_i32 s57, s57, s61 +/* temp1 = (wg%CU_Count)//WGMXCC */ +s_lshr_b32 s58, s58, s60 +/* temp0 = temp0 + temp1 */ +s_add_u32 s57, s57, s58 +/* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ +v_cvt_f32_u32 v10, s61 // WGs//CU_Count +v_rcp_iflag_f32 v10, v10 // WGs//CU_Count +v_cvt_f32_u32 v11, s54 // WGs//CU_Count +v_mul_f32 v10, v10, v11 // WGs//CU_Count +v_cvt_u32_f32 v10, v10 // WGs//CU_Count +v_mul_u32_u24 v11, v10, s61 // WGs//CU_Count +v_sub_u32 v11, s54, v11 // WGs//CU_Count +v_cmpx_eq_u32 exec, v11, s61 // WGs//CU_Count +v_add_u32 v10, 1, v10 // WGs//CU_Count +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s61 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s58, v10 // quotient 
+s_mul_i32 s58, s58, s61 +s_sub_u32 s59, s54, s58 +s_cmp_gt_u32 s[sgprWorkGroup0], s58 +s_cselect_b32 s58, s59, s61 +s_lshr_b32 s58, s58, s60 +s_bfm_b32 s59, s60, 0 +s_and_b32 s59, s[sgprWorkGroup0], s59 +s_mul_i32 s58, s58, s59 +/* WorkGroup0 = temp0 + temp1 */ +s_add_u32 s[sgprWorkGroup0], s57, s58 +label_skip_WGMXCC: /// skip WGMXCC if no enough WGs to remap +//s_mov_b32 s[sgprWorkGroup0], 0 + +/* init: add vgpr [4...136) to pool */ +/* init: add vgpr [0...0) to pool */ +/* init: add agpr [0...256) to pool */ + +/******************************************/ +/* Local Read Addresses */ +/******************************************/ + +/* local read addresses: tile assignments a/b */ +/* lr0I */ +v_and_b32 v5, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v4, 15, v5 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v4, 6, v4 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v4, 3, v4 // 4. apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v5, 4, v5 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v4, v5, 3, v4 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v8, 6, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(64) +v_and_b32 v8, 1, v8 // 7. wave offset in M dimen: wtid0 = wtid / num1DWaves(2) +v_lshl_add_u32 v4, v8, 13, v4 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset +/* lr1J */ +v_and_b32 v6, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v5, 15, v6 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v5, 6, v5 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v5, 3, v5 // 4. 
apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v6, 4, v6 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v5, v6, 3, v5 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v7, 7, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(128) +v_and_b32 v7, 1, v7 // 7. wave offset in M dimen: wtid0 = wtid / num1DWaves(2) +v_lshl_add_u32 v5, v7, 13, v5 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset + +/* local read addresses: final offsets a */ +v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 +v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id +s_mov_b32 s53, 64 // LSU offset: stride = lsuStride(64) when umlds==True +v_mul_lo_u32 v6, s53, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1 // Final Offset: offset = (lro0+lsuoffset)*bpeDS +v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 + +/* local read addresses: final offsets b */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 +v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id + // LSU offset: stride = lsuStride(64) when umlds==True (dup assign opt.) 
+v_mul_lo_u32 v4, s53, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1 // Final Offset: offset = (lro1+lsuoffset)*bpeDS +v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 + +/* local read addresses: declare addresses a */ +/* N/A */ + +/* local read addresses: declare addresses b */ +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8200, v[vgprLocalReadAddrB+0] // += LdsOffsetB (lower) +v_add_u32 v[vgprLocalReadSwapAddrA], 66560, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping +v_add_u32 v[vgprLocalReadSwapAddrB], 66560, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping + +/******************************************/ +/* Local Write Addresses */ +/******************************************/ +/* LVCA = 8 */ +/* v5 = A-unroll = serial%LVCA */ +v_lshrrev_b32 v4, 3, v[vgprSerial] // 4 = Serial / 8 +v_and_b32 v5, 7, v[vgprSerial] // 5 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v5, 3, v5 // v5 = v5 * 8 +v_mov_b32 v8, v5 // copy for GlobalSplitU +/* LVCB = 8 */ +/* v7 = B-unroll = serial%LVCB */ +v_lshrrev_b32 v6, 3, v[vgprSerial] // 6 = Serial / 8 +v_and_b32 v7, 7, v[vgprSerial] // 7 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v7, 3, v7 // v7 = v7 * 8 +v_mov_b32 v9, v7 // copy for GlobalSplitU +/* lwaUnrollAssignmentA = v8 */ +/* lwaUnrollAssignmentB = v9 */ + +/* local write addresses: first offset a */ +v_mul_u32_u24 v10, 0x40, v4 // lwAL**(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v8, v10, 0x1 // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS 
+v_lshrrev_b32 v12, 10, v10 // padding 16 per block 1024 +v_lshl_add_u32 v10, v12, 4, v10 // padding 16 per block 1024 +s_nop 0 // 1 wait states required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrA], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait states +s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping + +/* local write addresses: first offset b */ +v_mul_u32_u24 v10, 0x40, v6 // lwBL*(DepthU_Compute + PAD), multiplier is 0x40 (64) +v_add_lshl_u32 v10, v9, v10, 0x1 // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS +v_lshrrev_b32 v12, 10, v10 // padding 16 per block 1024 +v_lshl_add_u32 v10, v12, 4, v10 // padding 16 per block 1024 +v_add_co_u32 v10, vcc, 0x8200, v10 // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=33280 +s_nop 0 // 1 wait states required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrB], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait states +s_add_u32 s[sgprSwapB], s[sgprLocalWriteAddrB], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping +v_mov_b32 v12, MT0 // set MT0 into vgpr v12 (divisor) +v_mov_b32 v11, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v10, v12 // v10 = ceil(v11 / v12) +v_rcp_iflag_f32 v10, v10 // v10 = ceil(v11 / v12) +v_cvt_f32_u32 v13, v11 // v10 = ceil(v11 / v12) +v_mul_f32 v10, v10, v13 // v10 = ceil(v11 / v12) +v_cvt_u32_f32 v10, v10 // v10 = ceil(v11 / v12) +v_mul_u32_u24 v13, v10, v12 // v10 = ceil(v11 / v12) +v_sub_u32 v13, v11, v13 // v10 = ceil(v11 / v12) +v_cmp_ne_u32 vcc, v13, 0 // v10 = ceil(v11 / v12) +v_addc_co_u32 v10, vcc, v10, 0, vcc // ceil +v_mov_b32 v12, MT1 // set MT1 into vgpr v12 (divisor) +v_mov_b32 v11, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v10 // set back to 
numWorkGroup0 +v_cvt_f32_u32 v10, v12 // v10 = ceil(v11 / v12) +v_rcp_iflag_f32 v10, v10 // v10 = ceil(v11 / v12) +v_cvt_f32_u32 v13, v11 // v10 = ceil(v11 / v12) +v_mul_f32 v10, v10, v13 // v10 = ceil(v11 / v12) +v_cvt_u32_f32 v10, v10 // v10 = ceil(v11 / v12) +v_mul_u32_u24 v13, v10, v12 // v10 = ceil(v11 / v12) +v_sub_u32 v13, v11, v13 // v10 = ceil(v11 / v12) +v_cmp_ne_u32 vcc, v13, 0 // v10 = ceil(v11 / v12) +v_addc_co_u32 v10, vcc, v10, 0, vcc // ceil +s_nop 0 // 1 wait states +v_readfirstlane_b32 s[sgprNumWorkGroups1], v10 // set back to numWorkGroup1 +s_waitcnt lgkmcnt(0) // wait for 44/0 bytes of kern args + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s52, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s53, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s52, s52, s53 +v_cvt_f32_u32 v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s52 +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_mul_u32_u24 v11, v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s52 +v_cmpx_eq_u32 exec, v11, s52 // s52 = s[sgprWorkGroup0] / s52 +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s52 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s52 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup2], s52 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s52, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s52, s52, s[sgprWorkGroup2] +s_mul_i32 s52, s52, s53 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v10, s[sgprNumWorkGroups0] // 
s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup1], s52 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s52, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 + +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 62 +.set sgprStaggerUIter, 51 +.set sgprWrapUA, 64 +.set sgprWrapUB, 66 +.set sgprGlobalReadIncsA, 68 +.set sgprGlobalReadIncsB, 69 +.set sgprScalarGlobalReadOffsetA, 70 +.set sgprScalarGlobalReadOffsetB, 77 +s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift +s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift +s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift +s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift + +/* Short circuit condition if Alpha == 0, then sumDims=0 */ +v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? 
+s_cbranch_vccz label_AlphaNonZero // branch if s[Alpha] != 0 +s_mov_b32 s[sgprSizesSum+0], 0 // Set summation dim=0 if Alpha == 0 +label_AlphaNonZero: + +/******************************************/ +/* Begin setupNewTile */ +/******************************************/ + +/* global read addresses: work-group */ +/* graWorkGroup mapping */ +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU // branch if GSU == 1 +// GSU-not-WGMapRR :nwg1 = (size1J + MT1J - 1) / MT1J; +s_and_b32 s84, s[sgprGSU], 0x4000 // SCC = (GSUWGMRR == 1) ? +s_cbranch_scc1 label_GSUWGMRR // branch if GSUWGMRR == 1 +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mov_b32 v11, 0 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup1], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v10 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx], v11 // remainder +s_branch label_GSUWGMRR_End +label_GSUWGMRR: +v_cvt_f32_u32 v10, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / 
s[sgprNumWorkGroups1] +v_rcp_iflag_f32 v10, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_f32 v10, v10, v11 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_u32_f32 v10, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_add_u32 v10, 1, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mov_b32 v11, 0 // s[sgprWorkGroup1] = s[sgprWorkGroup1] % s[sgprNumWorkGroups1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups1] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1] // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup1], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprGSUSumIdx], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v11 // remainder +label_GSUWGMRR_End: +s_mov_b32 s[sgprGSULog2BpeC], 1 +s_mov_b32 s[sgprGSULog2BpeD], 2 +s_branch label_GSU_End +label_GSU: +s_mov_b64 s[sgprGSUSumIdx:sgprGSUSumIdx+1], 0 // Set GSUSumIdx to 0 +s_mov_b32 s[sgprGSULog2BpeC], 1 +s_mov_b32 s[sgprGSULog2BpeD], 1 +label_GSU_End: +s_sext_i32_i16 s[sgprWGM], s[sgprWGM] // Restore WGM +s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? +s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 +s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? 
+s_cbranch_scc1 label_WGM // branch if WGM >= 0 +s_abs_i32 s[sgprWGM], s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprWorkGroup0], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s86, v10 // quotient +s_mul_i32 s87, s86, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s87, s[sgprWorkGroup0], s87 // WorkGroup0=remainder +s_mul_i32 s87, s87, s[sgprNumWorkGroups1] // (wg0 % WGM)*NumWorkGroups1 +s_add_u32 s87, s87, s[sgprWorkGroup1] // wgSerial = wg1 + (wg0 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprNumWorkGroups0] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprNumWorkGroups0], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s84, v10 // quotient +s_mul_i32 s85, s[sgprWGM], s84 // quotient * non-magic divisor +s_sub_u32 s85, s[sgprNumWorkGroups0], s85 // NumWorkGroups0=remainder +s_cmp_eq_u32 s85, 0 // remainder == 0 ? +s_cmov_b32 s85, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s86, s84 // blockId >= numFullBlocks ?
+s_cselect_b32 s84, s85, s[sgprWGM] +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup1] = s87 / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup1] = s87 / s84 +v_cvt_f32_u32 v11, s87 // s[sgprWorkGroup1] = s87 / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup1] = s87 / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup1] = s87 / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup1] = s87 / s84 +v_sub_u32 v11, s87, v11 // s[sgprWorkGroup1] = s87 / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup1] = s87 / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup1] = s87 / s84 +v_mov_b32 v11, 0 // s[sgprWorkGroup0] = s87 % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s87, v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v11 // remainder +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s84 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s87, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s86, s86, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s86 // wg0 += blockId * WGM +s_branch label_WGM +label_WGMPositive: +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprWorkGroup1], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s86, v10 // quotient +s_mul_i32 s87, s86, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s87, s[sgprWorkGroup1], s87 // 
WorkGroup1=remainder +s_mul_i32 s87, s87, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s87, s87, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprNumWorkGroups1], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s84, v10 // quotient +s_mul_i32 s85, s[sgprWGM], s84 // quotient * non-magic divisor +s_sub_u32 s85, s[sgprNumWorkGroups1], s85 // NumWorkGroups1=remainder +s_cmp_eq_u32 s85, 0 // remainder == 0 ? +s_cmov_b32 s85, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s86, s84 // blockId >= numFullBlocks ? 
+s_cselect_b32 s84, s85, s[sgprWGM] +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup0] = s87 / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup0] = s87 / s84 +v_cvt_f32_u32 v11, s87 // s[sgprWorkGroup0] = s87 / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup0] = s87 / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup0] = s87 / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup0] = s87 / s84 +v_sub_u32 v11, s87, v11 // s[sgprWorkGroup0] = s87 / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup0] = s87 / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup0] = s87 / s84 +v_mov_b32 v11, 0 // s[sgprWorkGroup1] = s87 % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s87, v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup0], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v11 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s84 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s87, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s86, s86, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s86 // wg1 += blockId * WGM +label_WGM: + +/* global read addresses: tile offset assignment a */ +/* graTileAssignmentA = v4 */ + +/* global read addresses: tile offset assignment b */ +/* graTileAssignmentB = v6 */ + +/* global read addresses: unroll assignment a */ +/* v5 */ + +/* global read addresses: unroll assignment b */ +/* v7 */ + +/* global read addresses: other free assignments */ +/* s[sgprWorkGroup2] */ + +/* global read addresses: tile offsets a */ + +/* global read addresses: tile offsets b */ + +/* global read addresses: unroll offsets a */ + +/* global read addresses: unroll offsets b */ + +/* global read addresses: final offsets a */ +GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0, 5, 4, 10 // gROA_0_0_0_0 
+s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 32 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+0], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 64 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+1], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 96 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+2], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+3], s[sgprStrideA0I], 128 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+3], s[sgprScalarGlobalReadOffsetA+3], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+4], s[sgprStrideA0I], 160 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+4], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+5], s[sgprStrideA0I], 192 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+5], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+6], s[sgprStrideA0I], 224 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+6], 0x1 // scalar offset *= bytes/element + +/* global read addresses: final offsets b */ +GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0, 7, 6, 10 // gROB_0_0_0_0 +s_mul_i32 s[sgprScalarGlobalReadOffsetB+0], s[sgprStrideB1J], 32 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+0], s[sgprScalarGlobalReadOffsetB+0], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+1], s[sgprStrideB1J], 64 // compute offset diff 
(scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+1], s[sgprScalarGlobalReadOffsetB+1], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+2], s[sgprStrideB1J], 96 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+2], s[sgprScalarGlobalReadOffsetB+2], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+3], s[sgprStrideB1J], 128 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+3], s[sgprScalarGlobalReadOffsetB+3], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+4], s[sgprStrideB1J], 160 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+4], s[sgprScalarGlobalReadOffsetB+4], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+5], s[sgprScalarGlobalReadOffsetB+5], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+6], s[sgprScalarGlobalReadOffsetB+6], 0x1 // scalar offset *= bytes/element + +/* global read addresses: addresses a */ +/* max read offset = size[n] * stride[n-1] */ +s_mul_hi_u32 s87, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s86, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s87, s86, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s86, s86, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_A // branch if GSUC == 1 +s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s84, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_A_End +label_GSUC_A: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s84, 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s85, s85, s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder 
+s_mul_i32 s84, s84, s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s84, s84, s85 // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s85, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s84, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_A_End: +s_add_u32 s86, s86, s84 // accum GsuOffset term to tilestart +s_addc_u32 s87, s87, s85 // accum GsuOffset term to tilestart +s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size +s_sub_u32 s84, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s85, constStrideAL, s84 // stride x (size-1) +s_mul_i32 s84, constStrideAL, s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size +s_sub_u32 s84, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s85, s[sgprStrideA0I], s84 // stride x (size-1) +s_mul_i32 s84, s[sgprStrideA0I], s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s86 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s87 // sub tileStart +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x1 // Set limit to use bytes +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s85, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s84, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s86, s86, s84 // accum wg term to tilestart +s_addc_u32 s87, s87, s85 // accum wg term to tilestart +s_lshl_b64 s[86:87], s[86:87], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s86 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s87 // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +/* global read addresses: addresses b */ +/* max read offset = size[n] * stride[n-1] */ +s_mul_hi_u32 s87, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s86, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s87, s86, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s86, s86, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_B // branch if GSUC == 1 +s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s84, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_B_End +label_GSUC_B: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s84, 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s85, s85, s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder 
+s_mul_i32 s84, s84, s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s84, s84, s85 // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s85, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s84, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_B_End: +s_add_u32 s86, s86, s84 // accum GsuOffset term to tilestart +s_addc_u32 s87, s87, s85 // accum GsuOffset term to tilestart +s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size +s_sub_u32 s84, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s85, constStrideBL, s84 // stride x (size-1) +s_mul_i32 s84, constStrideBL, s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size +s_sub_u32 s84, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s85, s[sgprStrideB1J], s84 // stride x (size-1) +s_mul_i32 s84, s[sgprStrideB1J], s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s86 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s87 // sub tileStart +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x1 // Set limit to use bytes +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s85, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s84, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s86, s86, s84 // accum wg term to tilestart +s_addc_u32 s87, s87, s85 // accum wg term to tilestart +s_lshl_b64 s[86:87], s[86:87], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s86 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s87 // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +/* global read addresses: increments a */ +s_and_b32 s85, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s85, s85, DepthU*BpeAGR // GSU*DepthU*Bpe +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR, s85 // incrA (unrollIdx) + +/* global read addresses: increments b */ +s_and_b32 s85, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s85, s85, DepthU*BpeBGR // GSU*DepthU*Bpe +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR, s85 // incrB (unrollIdx) +/* declare loop num iterations */ +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 6 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 64 +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? 
+s_cbranch_scc1 label_GSU_1 // branch if GSU == 1 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // s[sgprGSUSumIdx+1] = GSU (low 14 bits of the packed sgpr), used as the divisor below +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // v4 = float(GSU) +v_rcp_iflag_f32 v4, v4 // v4 = 1.0 / GSU +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // v5 = float(LoopCounterL) +v_mul_f32 v4, v4, v5 // v4 = LoopCounterL * (1.0 / GSU) +v_cvt_u32_f32 v4, v4 // v4 = candidate integer quotient +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // v5 = quotient * GSU +v_sub_u32 v5, s[sgprLoopCounterL], v5 // v5 = LoopCounterL - quotient * GSU (candidate remainder) +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // exec <- lanes where remainder == GSU (quotient one too small) +v_add_u32 v4, 1, v4 // bump the quotient by one in those lanes +v_mov_b32 v5, 0 // and zero the remainder there: s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_add_u32 s84, 1, s[sgprLoopCounterL] // tmp<-numIterMyWg+1 +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cmov_b32 s[sgprLoopCounterL], s84 // numIterMyWg++ when gsuSumIdx < remainder (SCC from the compare above) +label_GSU_1: +s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter +s_and_b32 s86, s[sgprStaggerU], 0x1f00 // isolate bits 12:8 of the packed StaggerU sgpr +s_lshr_b32 s86, s86, 0x8 // s86 = those bits as a shift amount (used as StaggerUStride below) +s_and_b32 s87, s[sgprStaggerU], 0xe000 // s87 = bits 15:13, tested by the StaggerUMapping compares below +s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff // keep only bits 7:0 as the staggerU value +s_mov_b32 s84, 
s[sgprStaggerU] // init staggerU +label_beginStaggerUIter: +s_lshl_b32 s85, s84, s86 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s85 // loopCount >= current shift Count +s_cbranch_scc1 label_endStaggerUIter // jump to end +s_lshr_b32 s84, s84, 1 // step down to smaller stagger +s_branch label_beginStaggerUIter // jump to begin +label_endStaggerUIter: +s_sub_u32 s85, s84, 1 // staggerU mask +s_cmp_ge_u32 s84, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s85, 0 // set Mask +s_cmp_eq_u32 s87, 0x0 // mapping field == 0 ? +s_cbranch_scc1 label_StaggerUMapping_1 // NOTE(review): taken when the field IS 0, so the WorkGroup0 assignment below runs only for a non-zero field - confirm intended polarity +s_mov_b32 s84, s[sgprWorkGroup0] // stagger input = WorkGroup0 +s_branch label_staggerInputEnd +label_StaggerUMapping_1: +s_cmp_eq_u32 s87, 0x2000 // mapping field == 0x2000 ? +s_cbranch_scc1 label_StaggerUMapping_2 +s_mov_b32 s84, s[sgprWorkGroup1] // stagger input = WorkGroup1 +s_branch label_staggerInputEnd +label_StaggerUMapping_2: +s_cmp_eq_u32 s87, 0x4000 // mapping field == 0x4000 ? +s_cbranch_scc1 label_StaggerUMapping_3 +s_mov_b32 s84, -0x1 // stagger input = all ones (the AND below then keeps the whole mask) +s_branch label_staggerInputEnd +label_StaggerUMapping_3: +s_cmp_eq_u32 s87, 0x6000 // mapping field == 0x6000 ? +s_cbranch_scc1 label_StaggerUMapping_4 +s_mul_i32 s85, s[sgprNumWorkGroups0], s[sgprWorkGroup1] // s85 = NumWorkGroups0 * WorkGroup1 +s_add_u32 s84, s84, s85 // NOTE(review): s84 still holds the stepped-down staggerU from the loop above on this path - confirm +s_add_u32 s84, s84, s[sgprWorkGroup0] // s84 += WorkGroup0 +s_branch label_staggerInputEnd +label_StaggerUMapping_4: +s_cmp_eq_u32 s87, 0x8000 // mapping field == 0x8000 ? +s_cbranch_scc1 label_staggerInputEnd +s_mov_b32 s84, -0x1 // stagger input = all ones +s_branch label_staggerInputEnd +label_staggerInputEnd: +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s84 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s86 // shift by StaggerUStride + +/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ +s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset (high dword) +s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset (low dword) +s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop +s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes 
accessed by the unroll loop +s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // WrapUA(lo) = GlobalReadIncsA - LoopCounterL*GlobalReadIncsA (remove one iteration) +s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // WrapUA(hi) = 0 - hi - borrow (remove one iteration) +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc (lower) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc (upper, with borrow) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ +s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset (high dword) +s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset (low dword) +s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop +s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop +s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // WrapUB(lo) = GlobalReadIncsB - LoopCounterL*GlobalReadIncsB (remove one iteration) +s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // WrapUB(hi) = 0 - hi - borrow (remove one iteration) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc (lower) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc (upper, with borrow) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // StaggerUIter += 2 (this is an add, not a subtract); StaggerUIter now contains target iteration to wrap +/* local read addresses: init pointers a */ + +/* localReadInitPointers */ +/* local read addresses: init pointers b */ + +/* localReadInitPointers */ + +/* prefetch: global -> local */ +s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? +s_cbranch_scc1 label_ShadowInitStart // skip to ShadowInitStart iter b/c numIter==0. NOTE(review): this also skips the acc initC sequence interleaved with the loads below - confirm acc is initialised on this path + +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS (lds modifier; m0 was just loaded with the LDS write address) 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> Reg 
0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS (lds modifier; m0 was just loaded with the LDS write address) 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +// Interleave Init C: acc0..acc15 are written with 0 directly; the MFMAs below fill the remaining acc tiles +v_accvgpr_write acc0, 0 // initC +v_accvgpr_write acc1, 0 // initC +v_accvgpr_write acc2, 0 // initC +v_accvgpr_write acc3, 0 // initC +v_accvgpr_write acc4, 0 // initC +v_accvgpr_write acc5, 0 // initC +v_accvgpr_write acc6, 0 // initC +v_accvgpr_write acc7, 0 // initC +v_accvgpr_write acc8, 0 // initC +v_accvgpr_write acc9, 0 // initC +v_accvgpr_write acc10, 0 // initC +v_accvgpr_write acc11, 0 // initC +v_accvgpr_write acc12, 0 // initC +v_accvgpr_write acc13, 0 // initC +v_accvgpr_write acc14, 0 // initC +v_accvgpr_write acc15, 0 // initC + +v_mov_b64 v[6:7], 0 // zero the MFMA input operand v[6:9] (low half) +v_mov_b64 v[8:9], 0 // zero the MFMA input operand v[6:9] (high half) + +v_mfma_f32_32x32x16_bf16 acc[16:31], v[6:9], v[6:9], acc[0:15] // initC: zero inputs times zero inputs, accumulated onto the already-zeroed acc[0:15], written to acc[16:31] +v_mfma_f32_32x32x16_bf16 acc[32:47], v[6:9], v[6:9], acc[0:15] // initC acc[32:47], same scheme +v_mfma_f32_32x32x16_bf16 acc[48:63], v[6:9], v[6:9], acc[0:15] // initC acc[48:63] +v_mfma_f32_32x32x16_bf16 acc[64:79], v[6:9], v[6:9], acc[0:15] // initC acc[64:79] +v_mfma_f32_32x32x16_bf16 acc[80:95], v[6:9], v[6:9], acc[0:15] // initC acc[80:95] +v_mfma_f32_32x32x16_bf16 acc[96:111], v[6:9], v[6:9], acc[0:15] // initC acc[96:111] +v_mfma_f32_32x32x16_bf16 acc[112:127], v[6:9], v[6:9], acc[0:15] // initC acc[112:127] +v_mfma_f32_32x32x16_bf16 acc[128:143], v[6:9], 
v[6:9], acc[0:15] + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +v_mfma_f32_32x32x16_bf16 acc[144:159], v[6:9], v[6:9], acc[0:15] // initC acc[144:159] (zero inputs, zeroed acc[0:15] as addend) +v_mfma_f32_32x32x16_bf16 acc[160:175], v[6:9], v[6:9], acc[0:15] // initC acc[160:175] +v_mfma_f32_32x32x16_bf16 acc[176:191], v[6:9], v[6:9], acc[0:15] // initC acc[176:191] +v_mfma_f32_32x32x16_bf16 acc[192:207], v[6:9], v[6:9], acc[0:15] // initC acc[192:207] +v_mfma_f32_32x32x16_bf16 acc[208:223], v[6:9], v[6:9], acc[0:15] // initC acc[208:223] +v_mfma_f32_32x32x16_bf16 acc[224:239], v[6:9], v[6:9], acc[0:15] // initC acc[224:239] +v_mfma_f32_32x32x16_bf16 acc[240:255], v[6:9], v[6:9], acc[0:15] // initC acc[240:255] + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +/* global read inc A loopL */ +s_add_u32 s86, s[sgprLoopCounterL], 1 // s86 = LoopCounterL plus 1 (pf adjustment) +s_cmp_eq_u32 s[sgprStaggerUIter], s86 // Is this wrapIter? (pf) +s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- WrapUA+0 on the wrap iteration, else GlobalReadIncsA+0 +s_cselect_b32 s85, s[sgprWrapUA+1], 0 // incUpper <- ? 
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc (lower) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc (upper, with borrow) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +/* global read inc B loopL */ +s_add_u32 s86, s[sgprLoopCounterL], 1 // s86 = LoopCounterL plus 1 (pf adjustment) +s_cmp_eq_u32 s[sgprStaggerUIter], s86 // Is this wrapIter? (pf) +s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- WrapUB+0 on the wrap iteration, else GlobalReadIncsB+0 +s_cselect_b32 s85, s[sgprWrapUB+1], 0 // incUpper <- WrapUB+1 on the wrap iteration, else 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc (lower) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc (upper, with borrow) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + +/******************************************/ +/* End setupNewTile */ +/******************************************/ +label_ShadowInitStart: +s_mov_b64 s[sgprSrdD+0:sgprSrdD+0+1], s[sgprAddressD+0:sgprAddressD+0+1] // init SRD base address +s_mov_b32 s[sgprSrdD+2], BufferOOB // SRD D dword 2 <- BufferOOB +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +s_mov_b64 s[sgprSrdC+0:sgprSrdC+0+1], s[sgprAddressC+0:sgprAddressC+0+1] // init SRD base address +s_mov_b32 s[sgprSrdC+2], BufferOOB // SRD C dword 2 <- BufferOOB +s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD + + +s_mul_i32 s86, MT1, s[sgprWorkGroup1] // <- wg1*MT1 +s_mul_hi_u32 s85, s86, s[sgprStrideC1J] // ScaleC s86 by Stride (high dword) +s_mul_i32 s84, s86, s[sgprStrideC1J] // ScaleC s86 by Stride (low dword) +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC] // scale by bpe (shift count taken from sgprGSULog2BpeC) +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s85 // add hi to SRD +s_mul_hi_u32 s85, s86, s[sgprStrideD1J] // ScaleD s86 by Stride (high dword) +s_mul_i32 s84, s86, s[sgprStrideD1J] // ScaleD s86 by Stride (low dword) +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD] // scale by bpe (shift count taken from sgprGSULog2BpeD) +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s85 // add hi to SRD + +s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC] // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s85 // add hi to SRD +s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD] // scale by 
bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85 // add hi to SRD + +s_and_b32 s84, s[sgprGSU], 0x3fff // s84 = low 14 bits of the packed GSU sgpr +s_cmp_eq_u32 s84, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU_2 // branch if GSU == 1 +// GSU output buffer offset = (Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK) * GSUSumIdx * 4 bytes +s_mul_hi_u32 s85, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0 * GSUSumIdx (high dword) +s_mul_i32 s84, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0 * GSUSumIdx (low dword) +s_sub_u32 s86, s[sgprSizesFree+1], 1 // Free1 - 1 +s_mul_i32 s86, s86, s[sgprGSUSumIdx] // (Free1 - 1) * GSUSumIdx +s_mul_hi_u32 s87, s86, s[sgprStrideC1J] // Free1 term * StrideC1J (high dword) +s_mul_i32 s86, s86, s[sgprStrideC1J] // Free1 term * StrideC1J (low dword) +s_add_u32 s84, s84, s86 // accumulate Free1 term (low) +s_addc_u32 s85, s85, s87 // accumulate Free1 term (high, with carry) +s_sub_u32 s86, s[sgprSizesFree+2], 1 // Free2 - 1 +s_mul_i32 s86, s86, s[sgprGSUSumIdx] // (Free2 - 1) * GSUSumIdx +s_mul_hi_u32 s87, s86, s[sgprStrideCK] // Free2 term * StrideCK (high dword) +s_mul_i32 s86, s86, s[sgprStrideCK] // Free2 term * StrideCK (low dword) +s_add_u32 s84, s84, s86 // accumulate Free2 term (low) +s_addc_u32 s85, s85, s87 // accumulate Free2 term (high, with carry) +s_lshl_b64 s[84:85], s[84:85], 2 // scale by bpe (shift left by 2, i.e. x4 bytes) +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84 // add lo GSU offset to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85 // add hi GSU offset to SRD +label_GSU_2: +.set sgprGSULog2BpeC, UNDEF +.set sgprAddressC, UNDEF + +s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? 
+ +/* after InitC, skip to end of prefetch last iter if numIter==0 */ +s_cbranch_scc0 label_NoBranch_T8JHFHKM7BO5OHXW // Only branch on scc1 +s_getpc_b64 s[84:85] // addr of next instr +s_add_i32 s86, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s84, s84, s86 // add target branch offset +s_addc_u32 s85, s85, 0 // add high and carry +s_setpc_b64 s[84:85] // branch to label_PrefetchGlobalLastIterEnd +label_NoBranch_T8JHFHKM7BO5OHXW: + +/* local write a */ + +/* local write b */ + +/* local write swap a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* local write swap b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR +s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // PGR=2 but only 1 loop +s_cbranch_scc1 label_skipPGR2 // PGR=2 but only 1 loop: skip the second set of global loads +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address + +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS (lds modifier; m0 was just loaded with the LDS write address) 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], 
s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS (lds modifier; m0 was just loaded with the LDS write address) 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write 
address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + + +/* local write swap a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* local write swap b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR + + +label_skipPGR2: + +s_waitcnt vmcnt(24) // wait until at most 24 vector-memory ops are outstanding. NOTE(review): when label_skipPGR2 is reached by the branch above, the 16 loads of this section were not issued, so this wait and the vmcnt(16) below presumably cannot block - confirm the 1-iteration path still sees its LDS data +s_barrier + +/* local read prefetch a */ +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +s_waitcnt vmcnt(16) // wait until at most 16 vector-memory ops are outstanding (see the review note on vmcnt(24) above) +s_barrier + +/* local read prefetch b */ +ds_read_b128 
v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +s_waitcnt lgkmcnt(0) + +/******************************************/ +/* Unrolled Loop(s) - Begin */ +/******************************************/ +label_openLoopL: +s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL < EndCounter +s_cbranch_scc1 label_toPGR1 // PGR=2 but only 1 loop, toPGR1 +s_cmp_le_u32 s[sgprLoopCounterL], 0x2 // LoopCounterL < EndCounter +s_cbranch_scc1 label_LoopEndL // do not enter LoopL + + +// MAIN LOOP MACRO - Shared code between Even/Odd simds +.macro MAINLOOP isOdd + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], 
v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* global read inc A loopL */ +s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? +s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +s_cselect_b32 s85, s[sgprWrapUA+1], 0 // incUpper <- ? 
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +/* global read inc B loopL */ +s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUB+1], 0 // incUpper <- ? 
+/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +/* mfmaIndex:16 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+/* mfmaIndex:17 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +/* mfmaIndex:18 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:19 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +s_waitcnt lgkmcnt(0) // wait for A local reads +/* mfmaIndex:21 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +s_barrier + +.if \isOdd == 0 +////////////////////////////////////////////////////////////////////// EVEN WAVES +/* mfmaIndex:22 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 +/* mfmaIndex:23 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:24 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], 
v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:25 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 , lds // G -> Reg 0_0_1_0 +/* mfmaIndex:26 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:27 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:28 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 , lds // G -> Reg 0_0_2_0 +/* mfmaIndex:29 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:30 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], 
v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:31 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 , lds // G -> Reg 0_0_3_0 +/* mfmaIndex:32 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:33 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:34 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 , lds // G -> Reg 0_0_4_0 +/* mfmaIndex:35 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:36 */ 
+v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +.else +////////////////////////////////////////////////////////////////////// ODD WAVES +/* mfmaIndex:22 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:23 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 +/* mfmaIndex:24 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:25 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:26 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = 
acc[104+0:107+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 , lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:27 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:28 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:29 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 , lds // G -> Reg 0_0_2_0 + +/* mfmaIndex:30 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:31 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:32 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], 
v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 , lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:33 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:34 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:35 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 , lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:36 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +.endif ////////////////////////////////////////////////////////////////////// END branch + + +/* mfmaIndex:37 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], 
v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:39 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:41 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + /* mfmaIndex:43 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], 
v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] + /* mfmaIndex:47 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +s_waitcnt lgkmcnt(0) +/* mfmaIndex:51 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +s_barrier + +.if \isOdd == 0 +////////////////////////////////////////////////////////////////////// EVEN WAVES +/* mfmaIndex:52 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 , lds // G -> Reg 0_0_5_0 +/* mfmaIndex:53 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], 
v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:54 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 , lds // G -> Reg 0_0_6_0 +/* mfmaIndex:56 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:57 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] + +/* mfmaIndex:58 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 , lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:59 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +/* mfmaIndex:60 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], 
v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] + +/* mfmaIndex:61 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 + +/* mfmaIndex:62 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:63 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] + +// Iteration one + +/* mfmaIndex:64 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 , lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* local write swap offsets a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] + +.else +////////////////////////////////////////////////////////////////////// ODD WAVES +/* mfmaIndex:52 */ +v_mfma_f32_16x16x32_bf16 
acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] + +/* mfmaIndex:53 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 , lds // G -> Reg 0_0_5_0 + +/* mfmaIndex:54 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:55 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] + +/* mfmaIndex:56 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 , lds // G -> Reg 0_0_6_0 + +/* mfmaIndex:57 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:58 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] + + +/* mfmaIndex:59 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], 
v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 , lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:60 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +/* mfmaIndex:61 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] + + +/* mfmaIndex:62 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 + +/* mfmaIndex:63 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +// Iteration one + +/* mfmaIndex:64 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] + + +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 , lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], 
v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* local write swap offsets a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +.endif + + +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 
acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] + + +.if \isOdd == 0 + +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* 
local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 , lds // G -> Reg 0_0_2_0 + +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 , lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 , lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], 
v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] + + +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_waitcnt vmcnt(13) // wait for previous set of global reads + +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +s_barrier + +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // 
left value = acc[128+0:131+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 , lds // G -> Reg 0_0_5_0 + +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + + +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] + + +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 , lds // G -> Reg 0_0_6_0 + + +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] + +.else + +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left 
value = acc[76+0:79+0] + +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 , lds // G -> Reg 0_0_2_0 + + +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 , lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 nt, lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], 
v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] + + +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] + + +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_waitcnt vmcnt(13) // wait for previous set of global reads + +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +s_barrier + +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 nt, lds // G -> Reg 
0_0_5_0 + + +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + + +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 nt, lds // G -> Reg 0_0_6_0 + +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] + +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left 
value = acc[148+0:151+0] + +.endif + + +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 + + + +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 
vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] + + + +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] + +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] + + +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] + + + +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] + + + +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], 
v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] + +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] + + +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] + + +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] + + +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] + + +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:121 */ 
+v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] + + +.if \isOdd == 0 + +/* mfmaIndex:122 -- isOdd == 0 variant: the ds_read_b128 (offset:896) is issued after mfma 123 and the buffer_load_dwordx4 after mfma 124 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] + +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 , lds // G -> Reg 0_0_7_0 -- NOTE(review): this copy has no 'nt' modifier, while the copy of the same load in the .else branch below has 'nt'; confirm the difference is intentional + +.else + +/* mfmaIndex:122 -- isOdd != 0 variant: the same ds_read_b128 and a buffer_load_dwordx4 using the same offset register are each issued one MFMA earlier (after mfma 122 and mfma 123) */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 nt, lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], 
v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] + +.endif + + +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* local write swap offsets b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +s_cmp_eq_i32 s[sgprLoopCounterL], 0x2 // counterL==2 +s_waitcnt lgkmcnt(0) + +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +.endm + + + +// EVEN SIMDID takes WVLoop0 path, ODD SIMDID takes other path (WVLoop1); both paths rejoin at label_LoopEndL +s_getreg_b32 s86, hwreg(HW_REG_HW_ID, 4, 1) // s86 = the 1-bit field at bit offset 4 of HW_ID (used here as the SIMD id parity) +//s_and_b32 s86, s86, 1 (left disabled; the field read above is already a single bit) +s_cmp_eq_u32 s86, 0 // SCC = 1 when that bit is 0 +s_cbranch_scc0 WVLoop1 // bit set: jump to the odd-SIMD loop; otherwise fall through into WVLoop0 + +/******************************************/ +/* Unrolled Loop 1/1 - Begin (Even SIMD) */ +/******************************************/ +WVLoop0: +label_LoopBeginL0: +MAINLOOP 0 +/* closeLoop loopL finalLoop=1 tailLoop=0 */ +s_cbranch_scc0 label_LoopBeginL0 // restart LoopL while SCC == 0; SCC is expected to come from the counterL==2 compare at the end of the macro body above (assumes MAINLOOP is that macro -- TODO confirm) +s_branch label_LoopEndL // even path finished: skip over the odd-SIMD copy of the loop + +/******************************************/ +/* Unrolled Loop 1/1 - Begin (Odd SIMD) */ +/******************************************/ +WVLoop1: +label_LoopBeginL1: +MAINLOOP 1 +/* closeLoop loopL finalLoop=1 tailLoop=0 */ +s_cbranch_scc0 label_LoopBeginL1 // restart LoopL while SCC == 0; falls through to label_LoopEndL once SCC == 1 + +label_LoopEndL: + +/* Before NLL: Check VGPR.checkin for INT8 LW */ + +/******************************************/ +/* Ord. 
NoGlobalLoadLoop - Begin */ +/******************************************/ + +/* iter 0 (reset local read pointers iteration) (swap local read pointers iteration) */ +/* grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] + + +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] + + +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] 
+ +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] + +/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + + +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] + +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] + +/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 
v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] + +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] + +/* mfmaIndex:16 */ +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 
rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 
// L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 
v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 
acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ + +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk + +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* local read init pointers a */ + +/* localReadInitPointers */ + +/* local read init pointers b */ + +/* localReadInitPointers */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ + +/* iter 1 (swap and reset local write pointers iteration) */ +/* grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], 
acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], 
v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], 
v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left 
value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ + +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +s_waitcnt vmcnt(0) // wait for global reads with lds + +/* mfmaIndex:106 */ 
+v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +s_barrier +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +ds_read_b128 
v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] 
offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 
rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=1 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ +label_toPGR1: +s_and_b32 s8, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s8, 1 // GSU == 1 ? +s_cbranch_scc0 label_GSU_3 // branch if GSU != 1 + +/******************************************/ +/* Opt. 
NoLoadLoop - Begin */ +/******************************************/ +s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 +s_cbranch_scc0 label_OptNLL_End // Branch if Beta is not zero + +s_cmp_eq_u32 s[sgprAlpha], 1.0 // Alpha == 1.0 ? +s_cbranch_scc0 label_OptNLL_End // branch if alpha != 1 + +s_and_b32 s84, 255, s[sgprSizeI] // s84 = s[sgprSizeI] % 256 +s_add_u32 s85, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s85 // wg0 >= nwg0-1 ? +s_cselect_b32 s84, s84, 0 // set rMT0 +s_cmpk_gt_u32 s84, 0 // rMT0 > 0 +s_cbranch_scc1 label_OptNLL_End // jump if edges required +s_and_b32 s84, 255, s[sgprSizeJ] // s84 = s[sgprSizeJ] % 256 +s_add_u32 s85, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s85 // wg1 >= nwg1-1 +s_cselect_b32 s84, s84, 0 // set rMT1 +s_cmpk_gt_u32 s84, 0 // rMT1 > 0 +s_cbranch_scc1 label_OptNLL_End // jump if edges required + + + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] + +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = 
acc[12+0:15+0] + +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] + +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] + +/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] + +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 
v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] + +/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] + +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] + +/* mfmaIndex:16 */ +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:17 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:23 */ +/* localReadsVacancy: 
latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 
eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ + +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] + +/* iter 1 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = 
acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] 
// left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], 
v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ 
+v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], 
v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* 
mfmaIndex:116 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], 
v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ +label_toPGR1end_OptNLL: +/* Stores for OptNLL */ +label_Summation_End_OptNLL: +/* endSummation: add vgpr [0...132) to pool */ +/* load store sgprs */ + +/* Mapping of Acc register -> C Vgpr register */ +/* computeStoreVgprs */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 +v_lshrrev_b32 v5, 1, v4 // 5 = 4 / 2 +v_mul_lo_u32 v5, 0x10, v5 // wave coordination offset 1 +v_and_b32 v1, 63, v[vgprSerial] // v1 = v[vgprSerial] % 64 +v_lshrrev_b32 v1, 4, v1 // 1 = 1 / 16 +v_lshlrev_b32 v1, 2, v1 // thread0 * continuous_output +v_add_lshl_u32 v1, v5, v1, 3 // coordination 1 = vwB *(wave_id1 + tid1) +v_mul_lo_u32 v2, v1, s[sgprStrideC1J] // offset 1 +v_mul_lo_u32 v3, v1, s[sgprStrideD1J] // offset 1 +v_and_b32 v0, 1, v4 // v0 = v4 % 2 +v_mul_lo_u32 v0, 0x10, v0 // wave coordination offset 0 +v_and_b32 v5, 15, v[vgprSerial] // v5 = v[vgprSerial] % 16 +v_add_lshl_u32 v0, v5, v0, 3 // coordination 0 = vwA * (wave_id0 + tid0) +s_mul_i32 s8, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v0, s8, v0 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s8, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v1, s8, v1 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 + +/******************************************/ +/* Global Write Elements */ 
+/******************************************/ +label_GW_B0_E0: + +/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_lshl_u32 v11, v3, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=0, coord0Vgpr=0 +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+216], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+217], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+218], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+219], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+220], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+221], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+222], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+223], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+224], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+225], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+226], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+227], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+228], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+229], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+230], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+231], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+232], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+233], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+234], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+235], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+236], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+237], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+238], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+239], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+240], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+241], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+242], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+243], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+244], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+245], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+246], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+247], acc127 // copy acc to vreg[223] + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v8, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v9, 0x7fff0000 // fp32 Nan +v_mov_b32 v10, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, 
v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert 
C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 
and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt 
// store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], 
s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 
and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], 
v[vgprValuC+161] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD 
+= inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v216, v[vgprValuC+216], v[vgprValuC+217] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v217, v[vgprValuC+218], v[vgprValuC+219] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v218, v[vgprValuC+220], v[vgprValuC+221] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v219, v[vgprValuC+222], v[vgprValuC+223] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v224, v[vgprValuC+224], v[vgprValuC+225] // convert C to 
bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v225, v[vgprValuC+226], v[vgprValuC+227] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v226, v[vgprValuC+228], v[vgprValuC+229] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v227, v[vgprValuC+230], v[vgprValuC+231] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v232, v[vgprValuC+232], v[vgprValuC+233] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v233, v[vgprValuC+234], v[vgprValuC+235] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v234, v[vgprValuC+236], v[vgprValuC+237] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v235, v[vgprValuC+238], v[vgprValuC+239] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v240, v[vgprValuC+240], v[vgprValuC+241] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v241, v[vgprValuC+242], v[vgprValuC+243] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v242, v[vgprValuC+244], v[vgprValuC+245] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v243, v[vgprValuC+246], v[vgprValuC+247] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[240:243], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ /* Overview: last store batch before the kernel exit on this path. It handles four output rows (vc1 = 28..31); each row is written with a single buffer_store_dwordx4 carrying eight bf16 values packed into four dwords, and the batch ends by falling into label_GW_End / s_endpgm. */ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ /* Step 1: copy the 32 accumulator values of this batch, acc131, acc135, ..., acc255 (every 4th AGPR), into v[vgprValuC+16 .. vgprValuC+47]. The vreg[N] number in the trailing comments does not equal the destination offset; it is presumably the logical C-element index emitted by the generator - TODO confirm. */ +v_accvgpr_read_b32 v[vgprValuC+16], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+17], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+20], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+21], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+22], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+23], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+24], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+25], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+26], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+27], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+28], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+29], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+30], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+31], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+32], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+33], acc199 // 
copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+34], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+35], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+36], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+37], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+38], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+39], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+40], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+41], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+42], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+43], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+44], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+45], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+46], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+47], acc255 // copy acc to vreg[255] + +/* apply mask, calc new C and issue writes */ /* Step 2, repeated once per row (four times): (a) four v_cvt_pk_bf16_f32 instructions each convert two fp32 inputs and pack them into one dword, producing v[16:19], v[24:27], v[32:35] and v[40:43]; (b) s12 = s[sgprStrideD1J] << 1 (the generator comment calls this scaling by BPE) is added to s[sgprSrdD+0], and s_addc_u32 propagates the carry into s[sgprSrdD+1], so the instruction order s_add_u32 -> s_addc_u32 must be kept; (c) one buffer_store_dwordx4 with the offen and nt modifiers writes the row, taking its VGPR offset from v11. NOTE(review): s12 is recomputed by an identical s_lshl_b32 before every row although nothing visible in this batch writes s[sgprStrideD1J]; harmless, but it could be hoisted in the generator - confirm. */ +v_mov_b32 v8, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v9, 0x7fff0000 // fp32 NaN +v_mov_b32 v10, 0x7fff // rounding bias for bfloat16 /* NOTE(review): v8, v9 and v10 are not named as an operand by any later instruction in this batch (the conversions below use v_cvt_pk_bf16_f32 directly). They look like leftovers of a software bf16 rounding sequence - confirm against the generator before removing. */ +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert 
C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // 
incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End // jump to end /* NOTE(review): this branch targets the label that immediately follows it, so control reaches s_endpgm either way; the s_nop 0 above is likewise followed only by s_branch and s_endpgm on this path, neither of which writes a VGPR. Both look redundant but harmless - confirm in the generator rather than hand-editing this file. */ +label_GW_End: + +s_endpgm // Kernel End +label_OptNLL_End: +label_GSU_3: + +/******************************************/ +/* Ord. NoLoadLoop - Begin */ +/******************************************/ + +/* iter 0 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:0 */ +s_waitcnt lgkmcnt(7) // wait for prior local read local write old=0, new=7 newLW=0 newLR=7 for iteration == 0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:1 */ +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:2 */ +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:3 */ +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // 
left value = acc[12+0:15+0] +/* mfmaIndex:4 */ +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:5 */ +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:6 */ +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:7 */ +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:8 */ +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +s_waitcnt lgkmcnt(8) // wait for prior local read local write +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:9 */ 
+ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:10 */ +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:11 */ +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:12 */ +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:13 */ +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:14 */ +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 
swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:15 */ +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:16 */ +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = 
acc[80+0:83+0] +/* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:29 */ 
+/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ + +/* iter 1 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 
*/ +v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] 
+/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = 
acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_bf16 acc[132:135], 
v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], 
acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_bf16 acc[208:211], 
v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], 
acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ +label_toPGR1end_OrdNLL: +label_PrefetchGlobalLastIterEnd: + +/* Tail: add ValuA/B vgpr buffer [4...132) to pool */ + +/* Tail: add address/G2L vgpr [132...132) to pool */ +label_Summation_End_S4FDBQ587JJL6NOU: +.set sgprWGM, UNDEF +.set sgprLoopCounterL, UNDEF +.set sgprOrigLoopCounter, UNDEF +.set sgprAddressA, UNDEF +.set sgprAddressB, UNDEF +.set sgprStridesA, UNDEF +.set sgprStridesB, UNDEF +.set sgprStaggerUIter, UNDEF +.set sgprSrdA, UNDEF +.set sgprSrdB, UNDEF +.set sgprShadowLimitA, UNDEF +.set sgprShadowLimitB, UNDEF +.set sgprWrapUA, UNDEF +.set sgprWrapUB, UNDEF +.set sgprGlobalReadIncsA, UNDEF +.set sgprGlobalReadIncsB, UNDEF +.set sgprScalarGlobalReadOffsetA, UNDEF +.set sgprScalarGlobalReadOffsetB, UNDEF +/* load store sgprs */ + +/* Mapping of Acc register -> C Vgpr register */ + +/* not-LocalSplitU: global write indices */ +/* computeStoreVgprs */ +v_lshrrev_b32 v8, 6, v[vgprSerial] // 8 = Serial / 64 +v_lshrrev_b32 v9, 1, v8 // 9 = 8 / 2 +v_mul_lo_u32 v9, 0x10, v9 // wave coordination offset 1 +v_and_b32 v5, 63, v[vgprSerial] // v5 = v[vgprSerial] % 64 +v_lshrrev_b32 v5, 4, v5 // 5 = 5 / 16 +v_lshlrev_b32 v5, 2, v5 // thread0 * continuous_output +v_add_lshl_u32 v5, v9, v5, 3 // coordination 1 = vwB *(wave_id1 + tid1) +v_mul_lo_u32 v6, v5, s[sgprStrideC1J] // offset 1 +v_mul_lo_u32 v7, v5, s[sgprStrideD1J] // offset 1 
+v_and_b32 v4, 1, v8 // v4 = v8 % 2 +v_mul_lo_u32 v4, 0x10, v4 // wave coordination offset 0 +v_and_b32 v9, 15, v[vgprSerial] // v9 = v[vgprSerial] % 16 +v_add_lshl_u32 v4, v9, v4, 3 // coordination 0 = vwA * (wave_id0 + tid0) +s_mul_i32 s8, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v4, s8, v4 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s8, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v5, s8, v5 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 + +/* not-LocalSplitU: global write */ + +/******************************************/ +/* Global Write Elements */ +/******************************************/ +s_and_b32 s8, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s8, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU_4 // branch if GSU == 1 +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B0_E1_M // jump if edges required +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B0_E1_N // jump if edges required +label_GW_B0_E0_1: + +/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=26 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_lshl_u32 v15, v7, v4, 0x2 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+27], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+28], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+29], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+30], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+31], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+32], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+33], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+34], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+35], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+36], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+37], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+38], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+39], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+40], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+41], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+42], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+43], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+44], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+45], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+46], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+47], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+48], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+49], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+50], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+51], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+52], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+53], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+54], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+55], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+56], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+57], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+58], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+59], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+60], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+61], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+62], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+63], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+64], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+65], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+66], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+67], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+68], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+69], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+70], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+71], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+72], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+73], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+74], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+75], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+76], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+77], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+78], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+79], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+80], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+81], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+82], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+83], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+84], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+85], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+86], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+87], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+88], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+89], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+90], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+91], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+92], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+93], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+94], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+95], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+96], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+97], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+98], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+99], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+100], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+101], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+102], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+103], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+104], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+105], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+106], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+107], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+108], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+109], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+110], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+111], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+112], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+113], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+114], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+115], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+116], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+117], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+118], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+119], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+120], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+121], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+122], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+123], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+124], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+125], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+126], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+127], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+136], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+137], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+138], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+139], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+140], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+141], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+142], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+143], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+144], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+145], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+146], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+147], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+148], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+149], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+150], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+151], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+152], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+153], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+154], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+155], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+156], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+157], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+158], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+159], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+160], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+161], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+162], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+163], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+164], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+165], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+166], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+167], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+168], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+169], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+170], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+171], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+172], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+173], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+174], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+175], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+176], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+177], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+178], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+179], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+180], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+181], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+182], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+183], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+184], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+185], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+186], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+187], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+188], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+189], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+190], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+191], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+192], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+193], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+194], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+195], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+196], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+197], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+198], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+199], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+200], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+201], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+202], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+203], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+204], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+205], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+206], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+207], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+208], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+209], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+210], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+211], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+212], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+213], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+214], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+215], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+216], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+217], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+218], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+219], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+220], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+221], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+222], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+223], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+224], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+225], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+226], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+227], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+228], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+229], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+230], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+231], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+232], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+233], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+234], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+235], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+236], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+237], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+238], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+239], acc63 // copy acc to vreg[207] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[84:87], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[92:95], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[100:103], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[108:111], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[116:119], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[124:127], v15, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[140:143], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[148:151], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[156:159], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[164:167], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) 
+buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[172:175], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[180:183], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[188:191], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[196:199], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[204:207], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // 
incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[212:215], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[220:223], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[228:231], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[236:239], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ 
+/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+24], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+25], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+26], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+27], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+28], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+29], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+30], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+31], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+32], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+33], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+34], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+35], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+36], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+37], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+38], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+39], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+40], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+41], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+42], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+43], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+44], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+45], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+46], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+47], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+48], 
acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+49], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+50], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+51], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+52], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+53], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+54], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+55], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+56], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+57], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+58], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+59], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+60], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+61], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+62], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+63], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+64], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+65], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+66], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+67], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+68], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+69], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+70], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+71], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ + +/* apply mask, calc new C and issue writes */ +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // 
incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], 
s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_B0_E1_N: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // 
copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 
v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to vreg[145] 
+v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+198], acc186 
// copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[24:27], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v128, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:16 nt // store D +buffer_store_dwordx4 v[32:35], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[40:43], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[48:51], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[56:59], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[64:67], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[72:75], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[80:83], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[84:87], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[88:91], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[92:95], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[96:99], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[100:103], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[104:107], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[108:111], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[112:115], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dwordx4 v[116:119], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[120:123], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[124:127], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[136:139], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[140:143], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[144:147], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[148:151], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[152:155], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[156:159], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[160:163], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[164:167], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[168:171], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[172:175], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[176:179], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[180:183], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[184:187], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[188:191], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[192:195], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[196:199], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[200:203], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[204:207], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // 
store D +buffer_store_dwordx4 v[208:211], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[212:215], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v81, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v81, v10, v81, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v83, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v83, v10, v83, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v84, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v85, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v85, v10, v85, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v86, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+17], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+18], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+19], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+20], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+21], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+22], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+23], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+24], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+25], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+26], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+27], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+28], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+29], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+30], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+31], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+32], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+33], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+34], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+35], acc79 // copy acc to 
vreg[211] +v_accvgpr_read_b32 v[vgprValuC+36], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+37], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+38], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+39], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+40], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+41], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+42], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+43], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+44], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+45], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+46], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+47], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+48], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+49], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+50], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+51], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+52], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+53], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+54], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+55], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+56], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+57], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+58], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+59], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+60], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+61], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+62], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+63], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+64], acc195 // copy acc to 
vreg[240] +v_accvgpr_read_b32 v[vgprValuC+65], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+66], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+67], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+68], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+69], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+70], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+71], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+72], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+73], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+74], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+75], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+76], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+77], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+78], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+79], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[24:27], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[32:35], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[40:43], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D 
+buffer_store_dwordx4 v[48:51], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[56:59], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[64:67], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[72:75], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_B0_E1_M: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=114 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); 
(0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+16], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+17], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+18], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+19], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+20], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+21], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+22], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+23], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+24], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+25], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+26], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+27], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+28], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+29], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+30], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+31], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+32], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+33], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+34], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+35], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+36], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+37], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+38], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+39], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+40], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+41], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+42], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+43], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+44], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+45], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+46], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+47], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+48], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+49], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+50], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+51], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+52], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+53], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+54], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+55], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+56], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+57], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+58], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+59], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+60], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+61], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+62], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+63], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+64], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+65], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+66], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+67], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+68], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+69], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+70], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+71], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+72], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+73], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+74], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+75], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+76], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+77], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+78], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+79], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+80], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+81], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+82], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+83], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+84], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+85], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+86], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+87], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+88], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+89], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+90], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+91], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+92], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+93], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+94], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+95], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+96], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+97], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+98], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+99], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+100], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+101], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+102], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+103], acc97 // copy 
acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+104], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+105], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+106], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+107], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+108], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+109], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+110], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+111], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+112], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+113], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+114], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+115], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+116], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+117], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+118], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+119], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+120], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+121], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+122], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+123], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+124], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+125], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+126], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+127], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+128], acc197 // copy acc to vreg[113] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), 
(0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v67, v184, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v90, v207, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v113, v230, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); 
(0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe 
into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+16], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+17], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+18], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+19], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+20], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+21], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+22], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+23], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+24], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+25], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+26], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+27], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+28], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+29], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+30], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+31], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+32], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+33], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+34], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+35], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+36], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+37], acc34 // 
copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+38], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+39], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+40], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+41], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+42], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+43], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+44], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+45], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+46], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+47], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+48], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+49], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+50], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+51], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+52], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+53], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+54], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+55], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+56], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+57], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+58], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+59], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+60], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+61], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+62], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+63], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+64], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+65], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+66], acc150 // copy acc to 
vreg[165] +v_accvgpr_read_b32 v[vgprValuC+67], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+68], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+69], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+70], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+71], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+72], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+73], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+74], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+75], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+76], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+77], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+78], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+79], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+80], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+81], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+82], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+83], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+84], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+85], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+86], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+87], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+88], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+89], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+90], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+91], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+92], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+93], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+94], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+95], acc11 // copy acc to 
vreg[194] +v_accvgpr_read_b32 v[vgprValuC+96], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+97], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+98], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+99], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+100], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+101], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+102], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+103], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+104], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+105], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+106], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+107], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+108], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+109], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+110], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+111], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+112], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+113], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+114], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+115], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+116], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+117], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+118], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+119], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+120], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+121], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+122], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+123], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+124], acc127 // copy 
acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+125], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+126], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+127], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+128], acc143 // copy acc to vreg[227] + +/* rC *= alpha batchElements=[(0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 
0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v83, v200, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v106, v223, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state 
required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v43, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v43, v10, v43, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v45, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v45, v10, v45, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v47, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v47, v10, v47, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v49, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v49, v10, v49, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v51, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v51, v10, v51, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v53, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v53, v10, v53, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v55, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v55, v10, v55, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v57, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v57, v10, v57, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v59, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v59, v10, v59, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v61, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v61, v10, v61, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v63, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v63, v10, v63, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v65, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v65, v10, v65, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v67, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v67, v10, v67, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v69, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v69, v10, v69, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+16], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+17], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+18], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+19], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+20], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+21], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+22], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+23], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+24], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+25], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+26], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+27], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+28], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+29], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+30], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 
v[vgprValuC+31], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+32], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+33], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+34], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+35], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+36], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+37], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+38], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+39], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+40], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+41], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+42], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v15, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v23, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v37, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v69, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_End_1: +s_getpc_b64 s[30:31] // addr of next instr +s_add_i32 s32, label_KernelEnd, 4 // target branch offset +s_add_u32 s30, s30, s32 // add target 
branch offset +s_addc_u32 s31, s31, 0 // add high and carry +s_setpc_b64 s[30:31] // branch to label_KernelEnd +label_GSU_4: // Beta dispatch followed by edge detection for this store path +s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 +s_cbranch_scc0 label_GW_Beta_2 // Branch if Beta is not zero + +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] // s31 = NumWorkGroups0 - 1 +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0: keep SizeI % 256 when wg0 >= nwg0-1, otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B0_E1_M_1 // jump if edges required (non-zero SizeI remainder on this workgroup) +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] // s31 = NumWorkGroups1 - 1 +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 ? +s_cselect_b32 s30, s30, 0 // set rMT1: keep SizeJ % 256 when wg1 >= nwg1-1, otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B0_E1_N_1 // jump if edges required (non-zero SizeJ remainder on this workgroup) +label_GW_B0_E0_2: // reached when Beta == 0 and neither edge branch above was taken + +/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=26 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* 
(d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_lshl_u32 v15, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+27], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+28], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+29], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+30], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+31], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+32], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+33], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+34], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+35], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+36], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+37], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+38], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+39], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+40], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+41], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+42], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+43], acc76 // copy 
acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+44], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+45], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+46], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+47], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+48], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+49], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+50], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+51], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+52], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+53], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+54], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+55], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+56], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+57], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+58], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+59], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+60], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+61], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+62], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+63], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+64], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+65], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+66], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+67], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+68], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+69], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+70], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+71], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+72], acc192 // copy acc to vreg[48] 
+v_accvgpr_read_b32 v[vgprValuC+73], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+74], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+75], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+76], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+77], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+78], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+79], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+80], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+81], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+82], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+83], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+84], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+85], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+86], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+87], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+88], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+89], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+90], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+91], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+92], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+93], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+94], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+95], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+96], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+97], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+98], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+99], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+100], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+101], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+102], 
acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+103], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+104], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+105], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+106], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+107], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+108], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+109], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+110], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+111], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+112], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+113], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+114], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+115], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+116], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+117], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+118], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+119], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+120], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+121], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+122], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+123], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+124], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+125], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+126], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+127], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+136], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+137], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+138], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+139], 
acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+140], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+141], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+142], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+143], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+144], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+145], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+146], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+147], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+148], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+149], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+150], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+151], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+152], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+153], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+154], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+155], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+156], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+157], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+158], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+159], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+160], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+161], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+162], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+163], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+164], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+165], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+166], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+167], acc30 // copy acc to vreg[135] 
+v_accvgpr_read_b32 v[vgprValuC+168], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+169], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+170], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+171], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+172], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+173], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+174], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+175], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+176], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+177], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+178], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+179], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+180], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+181], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+182], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+183], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+184], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+185], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+186], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+187], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+188], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+189], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+190], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+191], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+192], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+193], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+194], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+195], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+196], acc146 // copy acc 
to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+197], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+198], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+199], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+200], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+201], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+202], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+203], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+204], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+205], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+206], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+207], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+208], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+209], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+210], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+211], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+212], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+213], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+214], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+215], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+216], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+217], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+218], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+219], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+220], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+221], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+222], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+223], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+224], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 
v[vgprValuC+225], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+226], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+227], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+228], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+229], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+230], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+231], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+232], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+233], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+234], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+235], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+236], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+237], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+238], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+239], acc63 // copy acc to vreg[207] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+216:vgprValuC+216+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+216:vgprValuC+216+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+218:vgprValuC+218+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+218:vgprValuC+218+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+220:vgprValuC+220+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+220:vgprValuC+220+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+222:vgprValuC+222+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+222:vgprValuC+222+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+224:vgprValuC+224+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+224:vgprValuC+224+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+226:vgprValuC+226+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+226:vgprValuC+226+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+228:vgprValuC+228+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+228:vgprValuC+228+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+230:vgprValuC+230+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+230:vgprValuC+230+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+232:vgprValuC+232+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+232:vgprValuC+232+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+234:vgprValuC+234+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+234:vgprValuC+234+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+236:vgprValuC+236+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+236:vgprValuC+236+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+238:vgprValuC+238+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+238:vgprValuC+238+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, 
s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, 
v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, 
v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += 
inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to 
bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v203, 
v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v216, v[vgprValuC+216], v[vgprValuC+217] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v217, v[vgprValuC+218], v[vgprValuC+219] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v218, v[vgprValuC+220], v[vgprValuC+221] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v219, v[vgprValuC+222], v[vgprValuC+223] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v224, v[vgprValuC+224], v[vgprValuC+225] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v225, v[vgprValuC+226], v[vgprValuC+227] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v226, v[vgprValuC+228], v[vgprValuC+229] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v227, v[vgprValuC+230], v[vgprValuC+231] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v232, v[vgprValuC+232], v[vgprValuC+233] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v233, v[vgprValuC+234], v[vgprValuC+235] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v234, v[vgprValuC+236], v[vgprValuC+237] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v235, v[vgprValuC+238], v[vgprValuC+239] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* 
(d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+24], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+25], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+26], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+27], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+28], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+29], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+30], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+31], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+32], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+33], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+34], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+35], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+36], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+37], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+38], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+39], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+40], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+41], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+42], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+43], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+44], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+45], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+46], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+47], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+48], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+49], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+50], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+51], acc175 // 
copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+52], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+53], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+54], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+55], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+56], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+57], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+58], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+59], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+60], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+61], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+62], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+63], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+64], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+65], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+66], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+67], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+68], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+69], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+70], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+71], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack 
with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store 
D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B0_E1_N_1: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 
v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy 
acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 
v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 
v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] 
+v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy 
acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 
0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] 
// *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], 
v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], 
v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[72:75], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], 
v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[80:83], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[88:91], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[96:99], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[104:107], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, 
v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[112:115], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[120:123], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[136:139], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[144:147], v225, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[152:155], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[160:163], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[168:171], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 
v[176:179], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[184:187], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[192:195], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[200:203], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 
and Pack with neighbor +buffer_store_dwordx4 v[208:211], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v81, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v81, v10, v81, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v83, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v83, v10, v83, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v84, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v85, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v85, v10, v85, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v86, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+16], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+17], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+18], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+19], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+20], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+21], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+22], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+23], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+24], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+25], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+26], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+27], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+28], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+29], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+30], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+31], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+32], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+33], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+34], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+35], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+36], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+37], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+38], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+39], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+40], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+41], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+42], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+43], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+44], acc115 // copy acc to vreg[220] 
+v_accvgpr_read_b32 v[vgprValuC+45], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+46], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+47], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+48], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+49], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+50], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+51], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+52], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+53], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+54], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+55], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+56], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+57], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+58], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+59], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+60], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+61], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+62], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+63], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+64], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+65], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+66], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+67], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+68], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+69], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+70], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+71], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+72], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+73], acc231 // copy acc to 
vreg[249] +v_accvgpr_read_b32 v[vgprValuC+74], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+75], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+76], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+77], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+78], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+79], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], 
v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] 
// convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[72:75], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B0_E1_M_1: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=114 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); 
(0,0,14,0:vw1); (0,0,14,1:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+16], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+17], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+18], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+19], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+20], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+21], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+22], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+23], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+24], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+25], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+26], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+27], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+28], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+29], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+30], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+31], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+32], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+33], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+34], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+35], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+36], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+37], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+38], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+39], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+40], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+41], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+42], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+43], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+44], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+45], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+46], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+47], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+48], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+49], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+50], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+51], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+52], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+53], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+54], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+55], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+56], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+57], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+58], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+59], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+60], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+61], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+62], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+63], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+64], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+65], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+66], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+67], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+68], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+69], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+70], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+71], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+72], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+73], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+74], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+75], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+76], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+77], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+78], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+79], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+80], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+81], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+82], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+83], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+84], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+85], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+86], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+87], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+88], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+89], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+90], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+91], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+92], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+93], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+94], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+95], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+96], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+97], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+98], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+99], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+100], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+101], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+102], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+103], acc97 // copy 
acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+104], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+105], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+106], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+107], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+108], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+109], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+110], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+111], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+112], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+113], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+114], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+115], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+116], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+117], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+118], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+119], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+120], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+121], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+122], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+123], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+124], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+125], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+126], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+127], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+128], acc197 // copy acc to vreg[113] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), 
(0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+128], s[sgprAlpha], v[vgprValuC+128] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for packing two bfloat16 elements into 32 bits +v_mov_b32 v13, 0x7fff0000 // fp32 NaN +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to 
bf16 in gwvw==1 +buffer_store_short v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v156, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], 
v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, 
v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to 
bf16 in gwvw==1 +buffer_store_short v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v91, v[vgprValuC+91], v[vgprValuC+91] // convert C to bf16 in gwvw==1 +buffer_store_short v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v92, v[vgprValuC+92], v[vgprValuC+92] // convert C to bf16 in gwvw==1 +buffer_store_short v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v93, v[vgprValuC+93], v[vgprValuC+93] // convert C to bf16 in gwvw==1 +buffer_store_short v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v94, v[vgprValuC+94], v[vgprValuC+94] // convert C to bf16 in gwvw==1 +buffer_store_short v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v95, v[vgprValuC+95], v[vgprValuC+95] // convert C to bf16 in gwvw==1 +buffer_store_short v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+96] // convert C to bf16 in gwvw==1 +buffer_store_short v96, v213, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v97, v[vgprValuC+97], v[vgprValuC+97] // convert C to bf16 in gwvw==1 +buffer_store_short v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v98, v[vgprValuC+98], v[vgprValuC+98] // convert C to bf16 in gwvw==1 +buffer_store_short v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v99, v[vgprValuC+99], v[vgprValuC+99] // convert C to bf16 in gwvw==1 +buffer_store_short v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v100, v[vgprValuC+100], v[vgprValuC+100] // convert C to bf16 in gwvw==1 +buffer_store_short v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v101, v[vgprValuC+101], v[vgprValuC+101] // convert C to bf16 in gwvw==1 +buffer_store_short v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v102, v[vgprValuC+102], v[vgprValuC+102] // convert C to bf16 in gwvw==1 +buffer_store_short v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v103, v[vgprValuC+103], v[vgprValuC+103] // convert C to bf16 in gwvw==1 +buffer_store_short v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+104] // convert C to bf16 in gwvw==1 +buffer_store_short v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v105, v[vgprValuC+105], v[vgprValuC+105] // convert C to bf16 in gwvw==1 +buffer_store_short v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v106, v[vgprValuC+106], v[vgprValuC+106] // convert C to bf16 in gwvw==1 +buffer_store_short v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v107, v[vgprValuC+107], v[vgprValuC+107] // convert C to bf16 in gwvw==1 +buffer_store_short v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v108, v[vgprValuC+108], v[vgprValuC+108] // convert C to bf16 in gwvw==1 +buffer_store_short v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v109, v[vgprValuC+109], v[vgprValuC+109] // convert C to bf16 in gwvw==1 +buffer_store_short v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v110, v[vgprValuC+110], v[vgprValuC+110] // convert C to bf16 in gwvw==1 +buffer_store_short v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v111, v[vgprValuC+111], v[vgprValuC+111] // convert C to bf16 in gwvw==1 +buffer_store_short v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+112] // convert C to bf16 in gwvw==1 +buffer_store_short v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v113, v[vgprValuC+113], v[vgprValuC+113] // convert C to bf16 in gwvw==1 +buffer_store_short v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v114, v[vgprValuC+114], v[vgprValuC+114] // convert C to bf16 in gwvw==1 +buffer_store_short v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v115, v[vgprValuC+115], v[vgprValuC+115] // convert C to bf16 in gwvw==1 +buffer_store_short v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v116, v[vgprValuC+116], v[vgprValuC+116] // convert C to bf16 in gwvw==1 +buffer_store_short v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v117, v[vgprValuC+117], v[vgprValuC+117] // convert C to bf16 in gwvw==1 +buffer_store_short v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v118, v[vgprValuC+118], v[vgprValuC+118] // convert C to bf16 in gwvw==1 +buffer_store_short v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v119, 
v[vgprValuC+119], v[vgprValuC+119] // convert C to bf16 in gwvw==1 +buffer_store_short v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+120] // convert C to bf16 in gwvw==1 +buffer_store_short v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v121, v[vgprValuC+121], v[vgprValuC+121] // convert C to bf16 in gwvw==1 +buffer_store_short v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v122, v[vgprValuC+122], v[vgprValuC+122] // convert C to bf16 in gwvw==1 +buffer_store_short v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v123, v[vgprValuC+123], v[vgprValuC+123] // convert C to bf16 in gwvw==1 +buffer_store_short v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v124, v[vgprValuC+124], v[vgprValuC+124] // convert C to bf16 in gwvw==1 +buffer_store_short v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v125, v[vgprValuC+125], v[vgprValuC+125] // convert C to bf16 in gwvw==1 +buffer_store_short v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v126, v[vgprValuC+126], v[vgprValuC+126] // convert C to bf16 in gwvw==1 +buffer_store_short v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v127, v[vgprValuC+127], v[vgprValuC+127] // convert C to bf16 in gwvw==1 +buffer_store_short v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v128, v[vgprValuC+128], v[vgprValuC+128] // convert C to bf16 in gwvw==1 +buffer_store_short v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + 
+/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */ +/******************************************/ + +/* calc coords, apply 
mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+16], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+17], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+18], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+19], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+20], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+21], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+22], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+23], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+24], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+25], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+26], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+27], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+28], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+29], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+30], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 
v[vgprValuC+31], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+32], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+33], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+34], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+35], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+36], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+37], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+38], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+39], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+40], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+41], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+42], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+43], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+44], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+45], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+46], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+47], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+48], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+49], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+50], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+51], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+52], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+53], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+54], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+55], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+56], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+57], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+58], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+59], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+60], 
acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+61], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+62], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+63], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+64], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+65], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+66], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+67], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+68], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+69], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+70], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+71], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+72], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+73], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+74], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+75], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+76], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+77], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+78], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+79], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+80], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+81], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+82], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+83], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+84], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+85], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+86], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+87], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+88], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 
v[vgprValuC+89], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+90], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+91], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+92], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+93], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+94], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+95], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+96], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+97], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+98], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+99], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+100], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+101], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+102], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+103], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+104], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+105], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+106], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+107], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+108], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+109], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+110], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+111], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+112], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+113], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+114], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+115], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+116], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+117], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 
v[vgprValuC+118], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+119], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+120], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+121], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+122], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+123], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+124], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+125], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+126], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+127], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+128], acc143 // copy acc to vreg[227] + +/* rC *= alpha batchElements=[(0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), 
(0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+128], s[sgprAlpha], v[vgprValuC+128] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to 
bf16 in gwvw==1 +buffer_store_short v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v150, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], 
v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, 
v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to 
bf16 in gwvw==1 +buffer_store_short v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v207, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v91, v[vgprValuC+91], v[vgprValuC+91] // convert C to bf16 in gwvw==1 +buffer_store_short v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v92, v[vgprValuC+92], v[vgprValuC+92] // convert C to bf16 in gwvw==1 +buffer_store_short v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v93, v[vgprValuC+93], v[vgprValuC+93] // convert C to bf16 in gwvw==1 +buffer_store_short v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v94, v[vgprValuC+94], v[vgprValuC+94] // convert C to bf16 in gwvw==1 +buffer_store_short v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v95, v[vgprValuC+95], v[vgprValuC+95] // convert C to bf16 in gwvw==1 +buffer_store_short v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+96] // convert C to bf16 in gwvw==1 +buffer_store_short v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v97, v[vgprValuC+97], v[vgprValuC+97] // convert C to bf16 in gwvw==1 +buffer_store_short v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v98, v[vgprValuC+98], v[vgprValuC+98] // convert C to bf16 in gwvw==1 +buffer_store_short v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v99, v[vgprValuC+99], v[vgprValuC+99] // convert C to bf16 in gwvw==1 +buffer_store_short v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v100, v[vgprValuC+100], v[vgprValuC+100] // convert C to bf16 in gwvw==1 +buffer_store_short v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v101, v[vgprValuC+101], v[vgprValuC+101] // convert C to bf16 in gwvw==1 +buffer_store_short v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v102, 
v[vgprValuC+102], v[vgprValuC+102] // convert C to bf16 in gwvw==1 +buffer_store_short v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v103, v[vgprValuC+103], v[vgprValuC+103] // convert C to bf16 in gwvw==1 +buffer_store_short v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+104] // convert C to bf16 in gwvw==1 +buffer_store_short v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v105, v[vgprValuC+105], v[vgprValuC+105] // convert C to bf16 in gwvw==1 +buffer_store_short v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v106, v[vgprValuC+106], v[vgprValuC+106] // convert C to bf16 in gwvw==1 +buffer_store_short v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v107, v[vgprValuC+107], v[vgprValuC+107] // convert C to bf16 in gwvw==1 +buffer_store_short v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v108, v[vgprValuC+108], v[vgprValuC+108] // convert C to bf16 in gwvw==1 +buffer_store_short v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v109, v[vgprValuC+109], v[vgprValuC+109] // convert C to bf16 in gwvw==1 +buffer_store_short v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v110, v[vgprValuC+110], v[vgprValuC+110] // convert C to bf16 in gwvw==1 +buffer_store_short v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v111, v[vgprValuC+111], v[vgprValuC+111] // convert C to bf16 in gwvw==1 +buffer_store_short v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+112] // convert C to bf16 in gwvw==1 +buffer_store_short v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v113, v[vgprValuC+113], 
v[vgprValuC+113] // convert C to bf16 in gwvw==1 +buffer_store_short v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v114, v[vgprValuC+114], v[vgprValuC+114] // convert C to bf16 in gwvw==1 +buffer_store_short v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v115, v[vgprValuC+115], v[vgprValuC+115] // convert C to bf16 in gwvw==1 +buffer_store_short v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v116, v[vgprValuC+116], v[vgprValuC+116] // convert C to bf16 in gwvw==1 +buffer_store_short v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v117, v[vgprValuC+117], v[vgprValuC+117] // convert C to bf16 in gwvw==1 +buffer_store_short v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v118, v[vgprValuC+118], v[vgprValuC+118] // convert C to bf16 in gwvw==1 +buffer_store_short v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v119, v[vgprValuC+119], v[vgprValuC+119] // convert C to bf16 in gwvw==1 +buffer_store_short v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+120] // convert C to bf16 in gwvw==1 +buffer_store_short v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v121, v[vgprValuC+121], v[vgprValuC+121] // convert C to bf16 in gwvw==1 +buffer_store_short v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v122, v[vgprValuC+122], v[vgprValuC+122] // convert C to bf16 in gwvw==1 +buffer_store_short v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v123, v[vgprValuC+123], v[vgprValuC+123] // convert C to bf16 in gwvw==1 +buffer_store_short v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v124, v[vgprValuC+124], v[vgprValuC+124] // convert C 
to bf16 in gwvw==1 +buffer_store_short v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v125, v[vgprValuC+125], v[vgprValuC+125] // convert C to bf16 in gwvw==1 +buffer_store_short v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v126, v[vgprValuC+126], v[vgprValuC+126] // convert C to bf16 in gwvw==1 +buffer_store_short v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v127, v[vgprValuC+127], v[vgprValuC+127] // convert C to bf16 in gwvw==1 +buffer_store_short v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v128, v[vgprValuC+128], v[vgprValuC+128] // convert C to bf16 in gwvw==1 +buffer_store_short v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 
+v_add_lshl_u32 v43, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v43, v10, v43, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v45, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v45, v10, v45, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v47, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v47, v10, v47, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v49, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v49, v10, v49, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v51, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v51, v10, v51, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v53, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v53, v10, v53, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v55, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v55, v10, v55, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v57, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v57, v10, v57, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v59, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v59, v10, v59, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v61, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v61, v10, v61, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v63, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v63, v10, v63, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v65, v10, v65, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v67, v10, v67, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v69, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v69, v10, v69, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+16], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+17], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+18], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+19], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+20], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+21], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+22], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+23], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+24], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+25], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+26], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+27], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+28], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+29], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+30], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+31], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+32], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+33], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+34], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+35], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+36], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+37], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+38], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+39], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+40], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+41], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+42], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 
5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+42], s[sgprAlpha], v[vgprValuC+42] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // 
convert C to bf16 in gwvw==1 +buffer_store_short v23, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v62, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v69, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst (NOTE(review): the stores in this batch are buffer_store_short, not dwordx4) +s_branch label_GW_End_2 // jump to end: skip the label_GW_Beta_2 code that follows +label_GW_Beta_2: // edge dispatch: branch to label_GW_B1_E1_M / label_GW_B1_E1_N if an edge is required, else fall through to label_GW_B1_E0 +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 (256 is presumably the tile size along I - TODO confirm) +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] // s31 = nwg0 - 1 +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? 
+s_cselect_b32 s30, s30, 0 // set rMT0: keep SizeI % 256 only when wg0 >= nwg0-1 (SCC set above), otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required (rMT0 > 0) +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 (256 is presumably the tile size along J - TODO confirm) +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] // s31 = nwg1 - 1 +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1: keep SizeJ % 256 only when wg1 >= nwg1-1 (SCC set above), otherwise 0 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required (rMT1 > 0) +label_GW_B1_E0: + +/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=18 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_add_lshl_u32 v16, v6, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +buffer_load_dwordx4 v[20:23], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[128:131], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[176:179], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[180:183], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[184:187], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[188:191], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[192:195], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[196:199], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[200:203], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[204:207], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[208:211], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[212:215], v16, 
s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[216:219], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[220:223], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[224:227], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[228:231], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[232:235], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: 
gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[236:239], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v15, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+27], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+28], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+29], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+30], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+31], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+32], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+33], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+34], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+35], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+36], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+37], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+38], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+39], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+40], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+41], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+42], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+43], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+44], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+45], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+46], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+47], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+48], acc96 // copy acc to 
vreg[24] +v_accvgpr_read_b32 v[vgprValuC+49], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+50], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+51], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+52], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+53], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+54], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+55], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+56], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+57], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+58], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+59], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+60], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+61], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+62], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+63], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+64], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+65], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+66], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+67], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+68], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+69], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+70], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+71], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+72], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+73], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+74], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+75], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+76], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+77], acc212 // copy acc to vreg[53] 
+v_accvgpr_read_b32 v[vgprValuC+78], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+79], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+80], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+81], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+82], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+83], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+84], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+85], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+86], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+87], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+88], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+89], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+90], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+91], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+92], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+93], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+94], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+95], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+96], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+97], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+98], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+99], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+100], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+101], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+102], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+103], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+104], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+105], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+106], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+107], 
acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+108], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+109], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+110], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+111], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+112], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+113], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+114], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+115], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+116], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+117], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+118], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+119], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+120], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+121], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+122], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+123], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+124], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+125], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+126], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+127], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+136], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+137], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+138], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+139], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+140], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+141], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+142], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+143], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 
v[vgprValuC+144], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+145], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+146], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+147], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+148], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+149], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+150], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+151], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+152], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+153], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+154], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+155], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+156], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+157], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+158], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+159], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+160], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+161], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+162], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+163], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+164], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+165], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+166], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+167], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+168], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+169], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+170], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+171], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+172], acc50 // copy acc to vreg[140] 
+v_accvgpr_read_b32 v[vgprValuC+173], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+174], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+175], acc62 // copy acc to vreg[143] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 + +s_waitcnt vmcnt(17) // vmcnt(17) = 18 - 1 (beta) (interleaved) +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(16) = 18 - 2 (beta) (interleaved) +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(15) = 18 - 3 (beta) (interleaved) +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with 
neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(14) = 18 - 4 (beta) (interleaved) +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(13) = 18 - 5 (beta) (interleaved) +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], 
s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(12) = 18 - 6 (beta) (interleaved) +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(11) = 18 - 7 (beta) (interleaved) +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(10) = 18 - 8 
(beta) (interleaved) +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(9) = 18 - 9 (beta) (interleaved) +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 
v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v201 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v201 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v203 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v203 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(8) = 18 - 10 (beta) (interleaved) +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v204 
src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(7) = 18 - 11 (beta) (interleaved) +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(6) = 18 - 12 (beta) (interleaved) +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_0 // cvt 
bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(5) = 18 - 13 (beta) (interleaved) +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(4) = 18 - 14 (beta) (interleaved) +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v221 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v221 src0_sel:WORD_1 // cvt bf16 to 
f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v223 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v223 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(3) = 18 - 15 (beta) (interleaved) +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+144], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+145], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+146], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+147], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+148], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+149], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+150], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+151], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(2) = 18 - 16 (beta) (interleaved) +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+152], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+153], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v229 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+154], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v229 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+155], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+156], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+157], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v231 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+158], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v231 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+159], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(1) = 18 - 17 (beta) (interleaved) +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+160], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+161], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v233 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+162], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v233 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+163], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+164], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+165], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v235 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+166], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v235 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+167], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(0) = 18 - 18 (beta) (interleaved) +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+168], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+169], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v237 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+170], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v237 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+171], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+172], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+173], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v239 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+174], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v239 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+175], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD 
+= inc(upper) +buffer_load_dwordx4 v[20:23], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[128:131], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[144:147], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[148:151], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[152:155], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[156:159], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 
s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[160:163], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[164:167], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[168:171], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[172:175], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[176:179], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[180:183], v16, 
s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[184:187], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[188:191], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+25], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+26], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+27], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+28], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+29], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+30], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+31], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+32], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+33], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+34], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+35], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+36], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+37], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+38], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+39], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+40], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+41], acc134 // copy acc to vreg[161] 
+v_accvgpr_read_b32 v[vgprValuC+42], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+43], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+44], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+45], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+46], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+47], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+48], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+49], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+50], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+51], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+52], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+53], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+54], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+55], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+56], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+57], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+58], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+59], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+60], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+61], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+62], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+63], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+64], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+65], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+66], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+67], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+68], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+69], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+70], acc250 // copy acc to 
vreg[190] +v_accvgpr_read_b32 v[vgprValuC+71], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+72], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+73], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+74], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+75], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+76], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+77], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+78], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+79], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+80], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+81], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+82], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+83], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+84], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+85], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+86], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+87], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+88], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+89], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+90], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+91], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+92], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+93], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+94], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+95], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+96], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+97], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+98], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+99], acc111 // copy acc to vreg[219] 
+v_accvgpr_read_b32 v[vgprValuC+100], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+101], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+102], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+103], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+104], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+105], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+106], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+107], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+108], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+109], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+110], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+111], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+112], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+113], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+114], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+115], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+116], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+117], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+118], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+119], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+120], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+121], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+122], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+123], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+124], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+125], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+126], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+127], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+136], 
acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+137], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+138], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+139], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+140], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+141], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+142], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+143], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= 
alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 + +s_waitcnt vmcnt(13) // vmcnt(13) = 14 - 1 (beta) (interleaved) +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v20 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v21 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v22 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v23 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 
v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(12) = 14 - 2 (beta) (interleaved) +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(11) = 14 - 3 (beta) (interleaved) +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v145 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v147 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v147 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with 
neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(10) = 14 - 4 (beta) (interleaved) +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v149 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v149 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v151 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack 
with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(9) = 14 - 5 (beta) (interleaved) +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack 
with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(8) = 14 - 6 (beta) (interleaved) +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(7) = 14 - 7 (beta) (interleaved) +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v161 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v161 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v163 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], 
s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(6) = 14 - 8 (beta) (interleaved) +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v15, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(5) = 14 - 9 (beta) (interleaved) +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(4) = 14 - 10 
(beta) (interleaved) +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(3) = 14 - 11 (beta) (interleaved) +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(2) = 14 - 12 (beta) (interleaved) +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v181 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v183 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(1) = 14 - 13 (beta) (interleaved) +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(0) = 14 - 14 (beta) (interleaved) +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + 
C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B1_E1_N: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=16 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[128:131], v15, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[152:155], v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[156:159], v160, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[164:167], v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[168:171], v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[172:175], v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[176:179], v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v180, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[184:187], v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[188:191], v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v182, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[192:195], v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[196:199], v200, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v200, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[204:207], v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[208:211], v202, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[212:215], v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[216:219], v220, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[224:227], v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] 
+v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 
v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] 
+v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc 
to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor 
+buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_f32_bf16 v8, v166 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to 
bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[72:75], v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[80:83], v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, 
s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[88:91], v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], 
v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[96:99], v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[104:107], v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 
v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[112:115], v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[120:123], v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = 
sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[136:139], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+144], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+145], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+146], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+147], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+148], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+149], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+150], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+151], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor 
+v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[144:147], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[128:131], v15, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[152:155], v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[156:159], v160, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[164:167], v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[168:171], v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[172:175], v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[176:179], v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v180, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[184:187], v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[188:191], v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v182, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[192:195], v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[196:199], v200, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v200, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[204:207], v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[208:211], v202, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[212:215], v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[216:219], v220, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[224:227], v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+17], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+18], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+19], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+20], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+21], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+22], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+23], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+24], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+25], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+26], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+27], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+28], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+29], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+30], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+31], acc62 // copy acc to vreg[143] 
+v_accvgpr_read_b32 v[vgprValuC+32], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+33], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+34], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+35], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+36], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+37], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+38], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+39], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+40], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+41], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+42], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+43], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+44], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+45], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+46], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+47], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+48], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+49], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+50], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+51], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+52], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+53], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+54], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+55], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+56], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+57], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+58], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+59], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+60], acc178 // copy acc to vreg[172] 
+v_accvgpr_read_b32 v[vgprValuC+61], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+62], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+63], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+64], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+65], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+66], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+67], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+68], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+69], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+70], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+71], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+72], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+73], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+74], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+75], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+76], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+77], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+78], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+79], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+80], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+81], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+82], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+83], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+84], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+85], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+86], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+87], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+88], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+89], acc39 // copy acc to vreg[201] 
+v_accvgpr_read_b32 v[vgprValuC+90], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+91], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+92], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+93], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+94], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+95], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+96], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+97], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+98], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+99], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+100], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+101], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+102], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+103], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+104], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+105], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+106], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+107], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+108], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+109], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+110], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+111], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+112], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+113], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+114], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+115], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+116], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+117], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+118], acc155 // copy acc to 
vreg[230] +v_accvgpr_read_b32 v[vgprValuC+119], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+120], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+121], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+122], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+123], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+124], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+125], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+126], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+127], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+136], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+137], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+138], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+139], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+140], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+141], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+142], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+143], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+144], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+145], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+146], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+147], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+148], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+149], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+150], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+151], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 
0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha 
(pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v153 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v155 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], 
v[vgprValuC+25] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[24:27], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v157 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v159 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[32:35], 
v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v165 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v167 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[40:43], v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v169 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v171 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[48:51], v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v173 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_1 // 
cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v175 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[56:59], v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, 
v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[64:67], v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor 
+buffer_store_dwordx4 v[72:75], v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[80:83], v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[88:91], v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+100], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+101], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+102], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+103], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[96:99], v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+104], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+105], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+106], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+107], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+108], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+109], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+110], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v207 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+111], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[104:107], v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+112], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+113], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+114], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v209 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+115], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+116], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+117], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+118], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v211 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+119], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v115, 
v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[112:115], v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+120], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+121], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+122], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v213 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+123], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+124], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+125], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+126], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v215 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+127], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[120:123], v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+136], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_1 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+137], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+138], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v217 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+139], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+140], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+141], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+142], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v219 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+143], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[136:139], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+144], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+145], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+146], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v225 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+147], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, 
v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+148], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+149], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+150], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_f32_bf16 v8, v227 src0_sel:WORD_1 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+151], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor +v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor +buffer_store_dwordx4 v[144:147], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B1_E1_M: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=76 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+16], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+17], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+18], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+19], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+20], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+21], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+22], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+23], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+24], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+25], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+26], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+27], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+28], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+29], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+30], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+31], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+32], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+33], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+34], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+35], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+36], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+37], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+38], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+39], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+40], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+41], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+42], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+43], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+44], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+45], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+46], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+47], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+48], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+49], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+50], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+51], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+52], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+53], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+54], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+55], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+56], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+57], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+58], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+59], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+60], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+61], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+62], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+63], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+64], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+65], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+66], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+67], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+68], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+69], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+70], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+71], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+72], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+73], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+74], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+75], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+76], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+77], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+78], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+79], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+80], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+81], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+82], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+83], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+84], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+85], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+86], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+87], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+88], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+89], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+90], acc45 // copy acc to vreg[75] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), 
(0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 
+buffer_store_short v15, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v99 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v101 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v105 src0_sel:WORD_0 // 
cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v107 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v111 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v113 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v117 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, 
v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v119 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v123 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v125 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v130, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v138 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v140 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, 
s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to 
bf16 in gwvw==1 +buffer_store_short v47, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, 
v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v197, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], 
v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v240 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v242 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v244 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); 
(0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+16], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+17], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+18], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+19], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+20], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+21], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+22], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+23], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+24], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+25], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+26], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+27], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+28], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+29], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+30], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+31], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+32], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+33], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 
v[vgprValuC+34], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+35], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+36], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+37], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+38], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+39], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+40], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+41], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+42], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+43], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+44], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+45], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+46], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+47], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+48], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+49], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+50], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+51], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+52], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+53], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+54], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+55], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+56], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+57], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+58], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+59], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+60], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+61], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+62], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 
v[vgprValuC+63], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+64], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+65], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+66], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+67], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+68], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+69], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+70], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+71], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+72], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+73], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+74], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+75], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+76], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+77], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+78], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+79], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+80], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+81], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+82], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+83], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+84], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+85], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+86], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+87], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+88], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+89], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+90], acc94 // copy acc to vreg[151] + +/* rC *= alpha batchElements=[(0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 
10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v99 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 
v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v101 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v105 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v107 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v111 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v112, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v113 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v117 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v119 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v123 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v125 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v138 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v140 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], 
v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 +buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 
+buffer_store_short v57, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v217, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v240 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v242 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], 
v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v244 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */ 
+/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+15], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+16], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+17], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+18], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+19], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+20], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+21], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+22], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+23], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+24], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+25], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+26], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+27], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+28], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+29], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+30], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+31], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+32], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+33], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+34], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+35], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+36], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+37], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+38], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+39], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 
v[vgprValuC+40], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+41], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+42], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+43], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+44], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+45], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+46], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+47], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+48], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+49], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+50], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+51], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+52], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+53], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+54], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+55], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+56], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+57], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+58], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+59], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+60], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+61], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+62], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+63], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+64], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+65], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+66], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+67], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+68], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 
v[vgprValuC+69], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+70], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+71], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+72], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+73], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+74], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+75], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+76], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+77], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+78], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+79], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+80], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+81], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+82], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+83], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+84], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+85], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+86], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+87], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+88], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+89], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+90], acc143 // copy acc to vreg[227] + +/* rC *= alpha batchElements=[(0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), 
(0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and 
issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v99 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v101 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v102, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v105 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v107 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v111 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v113 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v117 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v119 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v123 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v125 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], 
v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v138 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v140 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 +buffer_store_short v38, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +v_cvt_f32_bf16 v8, v144 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v146 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v150 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 +buffer_store_short v43, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v152 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 +buffer_store_short v44, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 +buffer_store_short v45, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v156 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 +buffer_store_short v46, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v158 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 +buffer_store_short v47, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 +buffer_store_short v48, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v162 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 +buffer_store_short v49, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v164 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 +buffer_store_short v50, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 +buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v168 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 
+buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v170 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 +buffer_store_short v53, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v174 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 +buffer_store_short v55, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 +buffer_store_short v56, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 +buffer_store_short v57, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 +buffer_store_short v58, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v182 
src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 +buffer_store_short v59, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 +buffer_store_short v60, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 +buffer_store_short v61, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 +buffer_store_short v62, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 +buffer_store_short v63, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 +buffer_store_short v64, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta 
+v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1 +buffer_store_short v65, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1 +buffer_store_short v66, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1 +buffer_store_short v67, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1 +buffer_store_short v68, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1 +buffer_store_short v69, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1 +buffer_store_short v70, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v206 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1 +buffer_store_short v71, v207, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v208 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1 +buffer_store_short v72, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v210 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1 +buffer_store_short v73, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v212 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1 +buffer_store_short v74, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v214 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1 +buffer_store_short v75, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v216 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1 +buffer_store_short v76, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v218 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1 +buffer_store_short v77, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v220 src0_sel:WORD_0 // cvt bf16 to f32 
+v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1 +buffer_store_short v78, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v222 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1 +buffer_store_short v79, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v224 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1 +buffer_store_short v80, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v226 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1 +buffer_store_short v81, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v228 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1 +buffer_store_short v82, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v230 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1 +buffer_store_short v83, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v232 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], 
v[vgprValuC+84] // convert C to bf16 in gwvw==1 +buffer_store_short v84, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v234 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1 +buffer_store_short v85, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v236 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1 +buffer_store_short v86, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v238 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1 +buffer_store_short v87, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v240 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1 +buffer_store_short v88, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v242 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1 +buffer_store_short v89, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v244 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1 +buffer_store_short v90, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 
nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */ +/* (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v43, v44, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v44, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v45, v46, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v46, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v47, v48, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v49, v50, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v50, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v51, v52, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v52, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v53, v54, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v54, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v55, v56, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v56, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v57, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v59, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v61, v62, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v62, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v63, v64, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v64, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v65, v66, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v66, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v67, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v68, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v69, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v72, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v72, v10, v72, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v71, v72, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v72, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v72, v10, v72, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v74, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v74, v10, v74, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v73, v74, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v74, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v74, v10, v74, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v76, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v76, v10, v76, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v75, v76, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v76, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v76, v10, v76, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v78, v10, v78, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v77, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v78, v10, v78, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v79, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v80, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v81, v82, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v82, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v84, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v83, v84, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v84, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v84, v10, v84, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v86, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v85, v86, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v86, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v86, v10, v86, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v88, v10, v88, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v87, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v88, v10, v88, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v89, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+15], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+16], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+17], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+18], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+19], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+20], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+21], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+22], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+23], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+24], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+25], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+26], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+27], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+28], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+29], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+30], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+31], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+32], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+33], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+34], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+35], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+36], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+37], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+38], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+39], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+40], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+41], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+42], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 
5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ +v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+42], s[sgprAlpha], v[vgprValuC+42] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_mov_b32 v12, 0xffff0000 // mask for pack two bfloat16 element to 32bit +v_mov_b32 v13, 0x7fff0000 // fp32 Nan +v_mov_b32 v14, 0x7fff // rounding bias for bfloat16 +v_cvt_f32_bf16 v8, v43 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+15], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1 +buffer_store_short v15, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v45 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+16], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1 +buffer_store_short v16, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v47 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+17], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 +buffer_store_short v17, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v49 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+18], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 +buffer_store_short v18, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v51 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+19], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v19, 
v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 +buffer_store_short v19, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v53 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+20], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 +buffer_store_short v20, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v55 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 +buffer_store_short v21, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 +buffer_store_short v22, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v59 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 +buffer_store_short v23, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v61 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 +buffer_store_short v24, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v63 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 +buffer_store_short v25, v64, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_cvt_f32_bf16 v8, v65 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 +buffer_store_short v26, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 +buffer_store_short v27, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v69 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 +buffer_store_short v28, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v71 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 +buffer_store_short v29, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v73 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 +buffer_store_short v30, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v75 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 +buffer_store_short v31, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // 
finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 +buffer_store_short v32, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v79 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 +buffer_store_short v33, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v81 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 +buffer_store_short v34, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v83 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 +buffer_store_short v35, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v85 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 +buffer_store_short v36, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 +buffer_store_short v37, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v89 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 
+buffer_store_short v38, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v91 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 +buffer_store_short v39, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v93 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 +buffer_store_short v40, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v95 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 +buffer_store_short v41, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 +v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta +v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 +buffer_store_short v42, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_End_2: +label_KernelEnd: +s_endpgm // Kernel End +label_ASM_End: /// The end of the kernel diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.s new file mode 100644 index 00000000000..d61cd02a7c9 --- /dev/null +++ 
b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.s @@ -0,0 +1,20086 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx950" +.text +.protected Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 +.globl Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 +.p2align 8 +.type Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_accum_offset 248 // accvgpr offset + .amdhsa_next_free_vgpr 504 // vgprs + .amdhsa_next_free_sgpr 88 // sgprs + .amdhsa_group_segment_fixed_size 133120 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_user_sgpr_count 13 + .amdhsa_user_sgpr_kernarg_preload_length 11 + .amdhsa_user_sgpr_kernarg_preload_offset 0 +.end_amdhsa_kernel +.text +/* Num VGPR =248 */ +/* Num AccVGPR=256 */ +/* Num SGPR =88 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 32 x 8 */ +/* SubGroup= 8 x 32 */ +/* VectorWidthA=8 */ +/* VectorWidthB=8 */ +/* GlobalReadVectorWidthA=8, GlobalReadVectorWidthB=8 */ +/* DirectToLdsA=True */ +/* DirectToLdsB=True */ +/* UseSgprForGRO=1 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 + ProblemType: + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + 
TransposeA: 1 + TransposeB: 0 + UseBeta: True + Batched: True + Activation: False +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950 + .symbol: 'Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 
+ .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + .group_segment_fixed_size: 133120 + .kernarg_segment_align: 8 + .kernarg_segment_size: 104 + .max_flat_workgroup_size: 256 + .private_segment_fixed_size: 0 + .sgpr_count: 88 + .sgpr_spill_count: 0 + .vgpr_count: 248 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Custom_Cijk_Alik_Bljk_HHS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname0_gfx950: +label_ASM_Start: /// Main body of the asm kernel +.macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req + v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber // scratch v[dst+1] = high 32 bits of dividend * magicNumber (presumably a precomputed magic-division constant - confirm against the host-side generator) + v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA // v[dst+0] = low 32 bits of dividend * magicA; dividend is re-read here, so it must not live in v[dst+1] + v_add_u32 v[\vgprDstIdx+0], v[\vgprDstIdx+0], v[\vgprDstIdx+1] // v[dst+0] += scratch (32-bit add, no carry-out is kept) + v_lshrrev_b32 v[\vgprDstIdx+0], \magicShift, v[\vgprDstIdx+0] // v[dst+0] >>= magicShift (logical); result stays in v[dst+0], v[dst+1] is clobbered +.endm + +/******************************************/ +/* VGPR Assignments */ +/******************************************/ +/* ValuC range: [0-0), serializedStore enabled */ +.set vgprValuC, 0 +/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ +.set vgprBase, 4 +.set vgprGlobalReadOffsetA, 0 +.set vgprGlobalReadOffsetB, 1 +.set vgprLocalReadAddrA, 2 +.set vgprLocalReadAddrB, 3 +.set vgprLocalReadSwapAddrA, 132 +.set vgprLocalReadSwapAddrB, 133 +.set vgprSerial, 134 + +/******************************************/ +/* VGPR Macro Assignments */ +/******************************************/ +.set vgprValuA_X0_I0_BASE, vgprBase+0 +.set vgprValuB_X0_I0_BASE, vgprBase+64 +.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0 +.set vgprValuA_X1_I0, vgprValuA_X0_I0_BASE+32 +.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0 +.set 
vgprValuB_X1_I0, vgprValuB_X0_I0_BASE+32 + +/******************************************/ +/* SGPR Assignments */ +/******************************************/ +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprWGM, 11 +.set sgprLoopCounterL, 12 +.set sgprOrigLoopCounter, 13 +.set sgprSrdD, 16 +.set sgprSrdC, 20 +.set sgprNumWorkGroups0, 14 +.set sgprNumWorkGroups1, 15 +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprLocalWriteAddrA, 46 +.set sgprLocalWriteAddrB, 47 +.set sgprSwapA, 48 +.set sgprSwapB, 49 +.set sgprGSU, 50 + +/* Size Assignments */ +.set sgprSizeI, sgprSizesFree+0 +.set sgprSizeJ, sgprSizesFree+1 +.set sgprSizeK, sgprSizesFree+2 +.set sgprSizeL, sgprSizesSum+0 + +/* Stride Assignments */ +.set constStrideD0I, 1 +.set sgprStrideD1J, sgprStridesD+0 +.set sgprStrideDK, sgprStridesD+1 +.set constStrideC0I, 1 +.set sgprStrideC1J, sgprStridesC+0 +.set sgprStrideCK, sgprStridesC+1 +.set constStrideAL, 1 +.set sgprStrideA0I, sgprStridesA+0 +.set sgprStrideAK, sgprStridesA+1 +.set constStrideBL, 1 +.set sgprStrideB1J, sgprStridesB+0 +.set sgprStrideBK, sgprStridesB+1 + +.set MT0, 256 +.set MT1, 256 +.set DepthU, 64 +.set BpeA, 2 +.set BpeALog2, 1 +.set BpeB, 2 +.set BpeBLog2, 1 +.set BpeAGR, 2 +.set BpeAGRLog2, 1 +.set BpeBGR, 2 +.set BpeBGRLog2, 1 +/* Number of elements to shift-left SRD */ +.set SrdShiftLeftA, 8 +.set SrdShiftLeftB, 8 +/* 2GB limit - set offsets to -1 to exceed this and clamp */ +.set BufferLimit, 0xffffffff +.set BufferOOB, 0x80000000 + +/******************************************/ +/* Bits 127:96 of SRD. 
*/ +/* hex: 0x20000 */ +/* dst_sel_x (3b): 0 */ +/* dst_sel_y (3b): 0 */ +/* dst_sel_z (3b): 0 */ +/* dst_sel_w (3b): 0 */ +/* num_format (3b): 0 */ +/* data_format (4b): 4 */ +/* user_vm_enable (1b): 0 */ +/* user_vm_mode (1b): 0 */ +/* index_stride (2b): 0 */ +/* add_tid_enable (1b): 0 */ +/* _unusedA (3b): 0 */ +/* nv (1b): 0 */ +/* _unusedB (2b): 0 */ +/* type (2b): 0 */ +/******************************************/ +.set Srd127_96, 0x20000 + +/* Global Offset A: v[vgprAddr] = (v[vgprOffsetL] + StrideA0I * v[vgprOffset0I] + 8) << 1; clobbers v[vgprTmp] and vcc */ +.macro GLOBAL_OFFSET_A vgprAddr:req, vgprOffsetL:req, vgprOffset0I:req, vgprTmp:req + v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // tmp = low 32 bits of StrideA0I * offset0I; vgprTmp must not alias vgprOffsetL, which is read next + v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // addr = offsetL + tmp (carry-out lands in vcc and is not consumed in this macro) + v_add_u32 v[\vgprAddr+0], 0x8, v[\vgprAddr+0] // add prepad for pointer shift (0x8 equals the value of SrdShiftLeftA) + v_lshlrev_b32 v[\vgprAddr+0], 1, v[\vgprAddr+0] // offset *= bytes/element (shift by 1 equals BpeALog2) +.endm + +/* Global Offset B: v[vgprAddr] = (v[vgprOffsetL] + StrideB1J * v[vgprOffset1J] + 8) << 1; clobbers v[vgprTmp] and vcc */ +.macro GLOBAL_OFFSET_B vgprAddr:req, vgprOffsetL:req, vgprOffset1J:req, vgprTmp:req + v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // tmp = low 32 bits of StrideB1J * offset1J; vgprTmp must not alias vgprOffsetL, which is read next + v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // addr = offsetL + tmp (carry-out lands in vcc and is not consumed in this macro) + v_add_u32 v[\vgprAddr+0], 0x8, v[\vgprAddr+0] // add prepad for pointer shift (0x8 equals the value of SrdShiftLeftB) + v_lshlrev_b32 v[\vgprAddr+0], 1, v[\vgprAddr+0] // offset *= bytes/element (shift by 1 equals BpeBLog2) +.endm + +/******************************************/ +/* Allocate Resources */ +/******************************************/ + +/* Load num of Gemms */ +s_load_dword s51, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 + +/* Load packed kernel args (StaggerU/GSU) */ +s_load_dword s53, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 + +/* Load num of WGs */ +s_load_dword s54, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_waitcnt lgkmcnt(0) // wait for the four s_load_dword results above +s_lshr_b32 s52, s51, 0x1e // Get arg type (top 2 bits of the first kernarg dword) +s_and_b32 s51, 0x3fffffff, s51 // Get nums of gemm (low 30 bits of the same dword) +s_cmp_eq_u32 s52, 0 
// Is kernel args +s_cbranch_scc0 label_HBMArgs +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_waitcnt lgkmcnt(0) // preload +s_branch label_LoadArgsEnd +label_HBMArgs: + +/* Load address of kernel arguments */ +s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 +s_waitcnt lgkmcnt(0) // wait for args to load +label_LoadArgsEnd: +s_branch label_common_kernel_entry + +/* pad 37 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +label_Preload_Offset_Start: +s_and_b32 s51, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s52, s2, 0x1e // Get arg type +s_mov_b32 s53, s3 // Preload internal args +s_cmp_eq_u32 s52, 0 // Is kernel args +s_cbranch_scc0 label_Preload_HBMArgs +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dword s31, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 +s_load_dwordx8 s[32:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_mov_b64 s[24:25], s[6:7] // move preload data to correct 
sgpr +s_mov_b64 s[26:27], s[8:9] // move preload data to correct sgpr +s_mov_b64 s[28:29], s[10:11] // move preload data to correct sgpr +s_mov_b32 s30, s12 // move preload data to correct sgpr +s_branch label_Preload_LoadArgsEnd +label_Preload_HBMArgs: +s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments +label_Preload_LoadArgsEnd: +s_mov_b32 s[sgprWGM], s4 // Preload internal args2 +s_mov_b32 s54, s5 // Load num of WGs +label_common_kernel_entry: /// for both preload/non-preload common code +s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id +s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id +s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id +s_and_b32 s[sgprStaggerU], s53, 0xffff0000 // Restore StaggerU related vars +s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 +s_and_b32 s[sgprGSU], s53, 0xffff // Restore GSUConfig and GSU +s_mov_b32 s[sgprArgType], s52 +s_mov_b32 m0, 0x20800 // LDS clamp at 133120 bytes +v_mov_b32 v[vgprSerial], v0 // thread serial id + +/* remap workgroup to XCCs */ +s_lshr_b32 s60, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s60, s60 // Get log(WGMXCC) +s_lshr_b32 s61, s[sgprWGM], 0x16 // Get CU_Count +/* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ +s_cmp_gt_i32 s60, 0 +s_cbranch_scc0 label_skip_WGMXCC +/* only remap WGs in the range */ +s_lshr_b32 s57, s54, s60 +s_lshl_b32 s57, s57, s60 +s_cmp_ge_u32 s[sgprWorkGroup0], s57 +s_cbranch_scc1 label_skip_WGMXCC +s_cmp_eq_u32 s61, 0 // CU_Count == 0 ? 
+s_cbranch_scc0 label_XCCG_nonzero +s_lshr_b32 s57, s[sgprWorkGroup0], s60 +s_bfm_b32 s58, s60, 0 +s_and_b32 s58, s[sgprWorkGroup0], s58 +s_lshr_b32 s59, s54, s60 +s_mul_i32 s58, s58, s59 +s_add_u32 s[sgprWorkGroup0], s57, s58 +s_branch label_skip_WGMXCC +label_XCCG_nonzero: +/* temp0 = (wg//CU_Count)*CU_Count */ +v_cvt_f32_u32 v10, s61 // wg//CU_Count +v_rcp_iflag_f32 v10, v10 // wg//CU_Count +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // wg//CU_Count +v_mul_f32 v10, v10, v11 // wg//CU_Count +v_cvt_u32_f32 v10, v10 // wg//CU_Count +v_mul_u32_u24 v11, v10, s61 // wg//CU_Count +v_sub_u32 v11, s[sgprWorkGroup0], v11 // wg//CU_Count +v_cmpx_eq_u32 exec, v11, s61 // wg//CU_Count +v_add_u32 v10, 1, v10 // wg//CU_Count +v_mov_b32 v11, 0 // wg//CU_Count +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s61 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s61 // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup0], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s57, v10 // quotient +v_readfirstlane_b32 s58, v11 // remainder +s_mul_i32 s57, s57, s61 +/* temp1 = (wg%CU_Count)//WGMXCC */ +s_lshr_b32 s58, s58, s60 +/* temp0 = temp0 + temp1 */ +s_add_u32 s57, s57, s58 +/* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ +v_cvt_f32_u32 v10, s61 // WGs//CU_Count +v_rcp_iflag_f32 v10, v10 // WGs//CU_Count +v_cvt_f32_u32 v11, s54 // WGs//CU_Count +v_mul_f32 v10, v10, v11 // WGs//CU_Count +v_cvt_u32_f32 v10, v10 // WGs//CU_Count +v_mul_u32_u24 v11, v10, s61 // WGs//CU_Count +v_sub_u32 v11, s54, v11 // WGs//CU_Count +v_cmpx_eq_u32 exec, v11, s61 // WGs//CU_Count +v_add_u32 v10, 1, v10 // WGs//CU_Count +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s61 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s58, v10 // quotient 
+s_mul_i32 s58, s58, s61 // s58 = (WGs//CU_Count) * CU_Count +s_sub_u32 s59, s54, s58 // s59 = WGs - (WGs//CU_Count) * CU_Count +s_cmp_gt_u32 s[sgprWorkGroup0], s58 // wg > (WGs//CU_Count) * CU_Count ? +s_cselect_b32 s58, s59, s61 // s58 = s59 if so, else CU_Count +s_lshr_b32 s58, s58, s60 // s58 //= WGMXCC (shift by log(WGMXCC)) +s_bfm_b32 s59, s60, 0 // s59 = mask of the low log(WGMXCC) bits +s_and_b32 s59, s[sgprWorkGroup0], s59 // s59 = wg % WGMXCC +s_mul_i32 s58, s58, s59 // temp1 +/* WorkGroup0 = temp0 + temp1 */ +s_add_u32 s[sgprWorkGroup0], s57, s58 +label_skip_WGMXCC: /// skip WGMXCC if not enough WGs to remap +s_cmp_eq_u32 s52, 0 // arg type (s52) == 0 ? +s_cbranch_scc0 label_MultiGemm // branch if arg type != 0 +/* init: add vgpr [4...136) to pool */ +/* init: add vgpr [0...0) to pool */ +/* init: add agpr [0...256) to pool */ + +/******************************************/ +/* Local Read Addresses */ +/******************************************/ + +/* local read addresses: tile assignments a/b */ +/* lr0I */ +v_and_b32 v5, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v4, 15, v5 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v4, 6, v4 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v4, 3, v4 // 4. apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v5, 4, v5 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v4, v5, 3, v4 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v8, 6, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(64) +v_and_b32 v8, 1, v8 // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(2) +v_lshl_add_u32 v4, v8, 13, v4 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset +/* lr1J */ +v_and_b32 v6, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v5, 15, v6 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v5, 6, v5 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v5, 3, v5 // 4.
apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v6, 4, v6 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v5, v6, 3, v5 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v7, 7, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(128) +v_and_b32 v7, 1, v7 // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(2) +v_lshl_add_u32 v5, v7, 13, v5 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset + +/* local read addresses: final offsets a */ +v_lshrrev_b32 v6, 6, v[vgprSerial] // v6 = Serial / 64 +v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id (v6 = v6 / 4) +s_mov_b32 s53, 64 // LSU offset: stride = lsuStride(64) when umlds==True +v_mul_lo_u32 v6, s53, v6 // LSU offset: lsuoffset = wave_id * lsuStride (s53 = 64) +v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1 // Final Offset: offset = (lro0+lsuoffset)*bpeDS +v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: v7 = offset / 1024 (block index) +v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: offset += 16 * block index (padding 16 per block 1024) + +/* local read addresses: final offsets b */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // v4 = Serial / 64 +v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id (v4 = v4 / 4) + // LSU offset: stride = lsuStride(64) when umlds==True (dup assign opt.)
+v_mul_lo_u32 v4, s53, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1 // Final Offset: offset = (lro1+lsuoffset)*bpeDS +v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 + +/* local read addresses: declare addresses a */ +/* N/A */ + +/* local read addresses: declare addresses b */ +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8200, v[vgprLocalReadAddrB+0] // += LdsOffsetB (lower) +v_add_u32 v[vgprLocalReadSwapAddrA], 66560, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping +v_add_u32 v[vgprLocalReadSwapAddrB], 66560, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping + +/******************************************/ +/* Local Write Addresses */ +/******************************************/ +/* LVCA = 8 */ +/* v5 = A-unroll = serial%LVCA */ +v_lshrrev_b32 v4, 3, v[vgprSerial] // 4 = Serial / 8 +v_and_b32 v5, 7, v[vgprSerial] // 5 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v5, 3, v5 // v5 = v5 * 8 +v_mov_b32 v8, v5 // copy for GlobalSplitU +/* LVCB = 8 */ +/* v7 = B-unroll = serial%LVCB */ +v_lshrrev_b32 v6, 3, v[vgprSerial] // 6 = Serial / 8 +v_and_b32 v7, 7, v[vgprSerial] // 7 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v7, 3, v7 // v7 = v7 * 8 +v_mov_b32 v9, v7 // copy for GlobalSplitU +/* lwaUnrollAssignmentA = v8 */ +/* lwaUnrollAssignmentB = v9 */ + +/* local write addresses: first offset a */ +v_mul_u32_u24 v10, 0x40, v4 // lwAL**(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v8, v10, 0x1 // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS 
+v_lshrrev_b32 v12, 10, v10 // v12 = offset / 1024 (block index) +v_lshl_add_u32 v10, v12, 4, v10 // offset += 16 * block index (padding 16 per block 1024) +s_nop 0 // 1 wait state required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrA], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait state +s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping + +/* local write addresses: first offset b */ +v_mul_u32_u24 v10, 0x40, v6 // lwBL*(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v9, v10, 0x1 // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS +v_lshrrev_b32 v12, 10, v10 // v12 = offset / 1024 (block index) +v_lshl_add_u32 v10, v12, 4, v10 // offset += 16 * block index (padding 16 per block 1024) +v_add_co_u32 v10, vcc, 0x8200, v10 // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=33280 +s_nop 0 // 1 wait state required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrB], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait state +s_add_u32 s[sgprSwapB], s[sgprLocalWriteAddrB], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping +v_mov_b32 v12, MT0 // set MT0 into vgpr (v12) +v_mov_b32 v11, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v10, v12 // v10 = float(v12) +v_rcp_iflag_f32 v10, v10 // v10 = 1.0 / v12 +v_cvt_f32_u32 v13, v11 // v13 = float(v11) +v_mul_f32 v10, v10, v13 // v10 = v11 * (1.0 / v12) +v_cvt_u32_f32 v10, v10 // v10 = quotient (float -> u32) +v_mul_u32_u24 v13, v10, v12 // v13 = quotient * v12 +v_sub_u32 v13, v11, v13 // v13 = v11 - quotient * v12 (remainder) +v_cmp_ne_u32 vcc, v13, 0 // vcc = (remainder != 0) +v_addc_co_u32 v10, vcc, v10, 0, vcc // v10 = quotient + vcc -> ceil(v11 / v12) +v_mov_b32 v12, MT1 // set MT1 into vgpr (v12) +v_mov_b32 v11, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v10 // set back to
numWorkGroup0 +v_cvt_f32_u32 v10, v12 // v10 = ceil(v11 / v12) +v_rcp_iflag_f32 v10, v10 // v10 = ceil(v11 / v12) +v_cvt_f32_u32 v13, v11 // v10 = ceil(v11 / v12) +v_mul_f32 v10, v10, v13 // v10 = ceil(v11 / v12) +v_cvt_u32_f32 v10, v10 // v10 = ceil(v11 / v12) +v_mul_u32_u24 v13, v10, v12 // v10 = ceil(v11 / v12) +v_sub_u32 v13, v11, v13 // v10 = ceil(v11 / v12) +v_cmp_ne_u32 vcc, v13, 0 // v10 = ceil(v11 / v12) +v_addc_co_u32 v10, vcc, v10, 0, vcc // ceil +s_nop 0 // 1 wait states +v_readfirstlane_b32 s[sgprNumWorkGroups1], v10 // set back to numWorkGroup1 +s_waitcnt lgkmcnt(0) // wait for 44/0 bytes of kern args + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s52, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s53, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s52, s52, s53 +v_cvt_f32_u32 v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s52 +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_mul_u32_u24 v11, v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s52 +v_cmpx_eq_u32 exec, v11, s52 // s52 = s[sgprWorkGroup0] / s52 +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s52 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s52 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup2], s52 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s52, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s52, s52, s[sgprWorkGroup2] +s_mul_i32 s52, s52, s53 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v10, s[sgprNumWorkGroups0] // 
s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup1], s52 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s52, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 +s_branch label_MultiGemmEnd +label_MultiGemm: + +/* Check if custom structure pointer is null */ +s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? +s_cbranch_scc1 label_IsExternalValid // branch if ArgType == 2 +s_mov_b32 s15, 88 +s_mul_i32 s58, s51, 4 +s_mov_b64 s[52:53], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_branch label_IsExternalValidEnd +label_IsExternalValid: +s_mov_b32 s15, 196 +s_mov_b32 s58, 0 +s_mov_b64 s[52:53], s[sgprKernArgAddress:sgprKernArgAddress+1] +label_IsExternalValidEnd: + +/* Grouped Gemm:: prefetch 1 arg load */ +s_mov_b32 s14, 1 +s_mov_b32 s59, 0 +s_load_dwordx4 s[24:27], s[52:53], s58 +s_cmpk_eq_u32 s51, 1 // if gemm_count is 1? 
+s_cbranch_scc1 label_wgTable_noLoadLoop // gemm_count == 1: skip the load loop + +/* Grouped Gemm:: accumulate numTiles for each gemm */ +/* Grouped Gemm:: loop start */ +label_Loop_GemmCount: +s_waitcnt lgkmcnt(0) // wait for the s[24:27] load +s_lshr_b32 s56, s24, 8 // s56 = s24 / 256 +s_and_b32 s54, 255, s24 // s54 = s24 % 256 +s_addc_u32 s56, s56, 0 // s56 += SCC (set by s_and_b32 when s54 != 0) -> ceil(s24 / 256) +s_lshr_b32 s57, s25, 8 // s57 = s25 / 256 +s_and_b32 s54, 255, s25 // s54 = s25 % 256 +s_addc_u32 s57, s57, 0 // s57 += SCC (set by s_and_b32 when s54 != 0) -> ceil(s25 / 256) +s_mul_i32 s56, s56, s57 // s56 = ceil(s24 / 256) * ceil(s25 / 256) +s_mul_i32 s56, s56, s26 // s56 *= s26 +s_and_b32 s57, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s56, s56, s57 // s56 *= GSU -> numTiles of this gemm +s_add_u32 s59, s59, s56 // s59 = accumulated numTiles +s_cmp_lt_u32 s[sgprWorkGroup0], s59 // WorkGroup0 < accumulated numTiles ? +s_cbranch_scc1 label_FOUND // yes: WorkGroup0 falls inside this gemm +s_add_u32 s58, s58, s15 // advance load offset by s15 +s_load_dwordx4 s[24:27], s[52:53], s58 // load s[24:27] for the next gemm +s_add_u32 s14, s14, 1 // s14 += 1 (gemm counter) +s_cmp_lt_u32 s14, s51 // gemm counter < gemm_count ? +s_cbranch_scc1 label_Loop_GemmCount // loop while gemms remain + +/* Grouped Gemm:: noLoadLoop */ +label_wgTable_noLoadLoop: +s_waitcnt lgkmcnt(0) // wait for the s[24:27] load +s_lshr_b32 s56, s24, 8 // s56 = s24 / 256 +s_and_b32 s54, 255, s24 // s54 = s24 % 256 +s_addc_u32 s56, s56, 0 // s56 += SCC (set by s_and_b32 when s54 != 0) -> ceil(s24 / 256) +s_lshr_b32 s57, s25, 8 // s57 = s25 / 256 +s_and_b32 s54, 255, s25 // s54 = s25 % 256 +s_addc_u32 s57, s57, 0 // s57 += SCC (set by s_and_b32 when s54 != 0) -> ceil(s25 / 256) +s_mul_i32 s56, s56, s57 // s56 = ceil(s24 / 256) * ceil(s25 / 256) +s_mul_i32 s56, s56, s26 // s56 *= s26 +s_and_b32 s52, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s56, s56, s52 // s56 *= GSU -> numTiles of this gemm +s_add_u32 s59, s59, s56 // s59 = accumulated numTiles + +/* Grouped Gemm:: gemmIndex found */ +label_FOUND: +s_sub_u32 s53, s14, 1 // s53 = gemm counter - 1 (gemmIndex) +s_sub_u32 s52, s59, s56 // s52 = numTiles accumulated before this gemm +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 // WorkGroup0 -= numTiles before this gemm +/* Check if custom structure pointer is null */ +s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ?
+s_cbranch_scc1 label_LoadExternalStruct // branch if ArgType == 2 + +/* Grouped Gemm: offset argument address to gemm */ +/* Grouped Gemm: offset address from wg_table_start to args_start */ +s_lshl2_add_u32 s[sgprKernArgAddress], s51, s[sgprKernArgAddress] +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 +/* Grouped Gemm: offset address from args_start to gemm_start */ +s_mul_i32 s53, s53, 88 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s53 +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 + +/* Load Kernel Args */ +s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_branch label_LoadExternalStructEnd +label_LoadExternalStruct: +/* Grouped Gemm: offset address from args_start to gemm_start */ +s_mul_i32 s53, s53, 196 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s53 +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 +s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 +s_load_dword s44, s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +// Read Beta +s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +label_LoadExternalStructEnd: +/* init: add vgpr [4...136) to pool */ +/* init: add vgpr [0...0) to pool */ +/* init: add agpr [0...256) to pool */ + +/******************************************/ +/* Local Read Addresses */ +/******************************************/ + +/* local read addresses: tile assignments a/b */ +/* lr0I */ +v_and_b32 v5, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v4, 15, v5 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v4, 6, v4 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v4, 3, v4 // 4. apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v5, 4, v5 // 5. 
K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v4, v5, 3, v4 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v8, 6, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(64) +v_and_b32 v8, 1, v8 // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(2) +v_lshl_add_u32 v4, v8, 13, v4 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7. final local read offset: flrOffset = lrOffset + WOffset +/* lr1J */ +v_and_b32 v6, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) +v_and_b32 v5, 15, v6 // 1. N offset: nIdx = wtid % MI_N(16) +v_lshlrev_b32 v5, 6, v5 // 1. N offset: nOffset = nIdx * nStride(64) +/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */ +v_lshlrev_b32 v5, 3, v5 // 4. apply VectorWidth: bnOffset = bnOffset * vw(8) +v_lshrrev_b32 v6, 4, v6 // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) +v_lshl_add_u32 v5, v6, 3, v5 // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset +v_lshrrev_b32 v7, 7, v[vgprSerial] // 7. wave offset in N dimen: wtid = tid / dividedForWaveId(128) +v_and_b32 v7, 1, v7 // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(2) +v_lshl_add_u32 v5, v7, 13, v5 // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192); 7.
final local read offset: flrOffset = lrOffset + WOffset + +/* local read addresses: final offsets a */ +v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 +v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id +s_mov_b32 s53, 64 // LSU offset: stride = lsuStride(64) when umlds==True +v_mul_lo_u32 v6, s53, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1 // Final Offset: offset = (lro0+lsuoffset)*bpeDS +v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 + +/* local read addresses: final offsets b */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 +v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id + // LSU offset: stride = lsuStride(64) when umlds==True (dup assign opt.) +v_mul_lo_u32 v4, s53, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1 // Final Offset: offset = (lro1+lsuoffset)*bpeDS +v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 +v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 + +/* local read addresses: declare addresses a */ +/* N/A */ + +/* local read addresses: declare addresses b */ +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8200, v[vgprLocalReadAddrB+0] // += LdsOffsetB (lower) +v_add_u32 v[vgprLocalReadSwapAddrA], 66560, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping +v_add_u32 v[vgprLocalReadSwapAddrB], 66560, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer +v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping 
+ +/******************************************/ +/* Local Write Addresses */ +/******************************************/ +/* LVCA = 8 */ +/* v5 = A-unroll = serial%LVCA */ +v_lshrrev_b32 v4, 3, v[vgprSerial] // 4 = Serial / 8 +v_and_b32 v5, 7, v[vgprSerial] // 5 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v5, 3, v5 // v5 = v5 * 8 +v_mov_b32 v8, v5 // copy for GlobalSplitU +/* LVCB = 8 */ +/* v7 = B-unroll = serial%LVCB */ +v_lshrrev_b32 v6, 3, v[vgprSerial] // 6 = Serial / 8 +v_and_b32 v7, 7, v[vgprSerial] // 7 = Serial % 8 +/* unroll *= glvw */ +v_lshlrev_b32 v7, 3, v7 // v7 = v7 * 8 +v_mov_b32 v9, v7 // copy for GlobalSplitU +/* lwaUnrollAssignmentA = v8 */ +/* lwaUnrollAssignmentB = v9 */ + +/* local write addresses: first offset a */ +v_mul_u32_u24 v10, 0x40, v4 // lwAL**(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v8, v10, 0x1 // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS +v_lshrrev_b32 v12, 10, v10 // padding 16 per block 1024 +v_lshl_add_u32 v10, v12, 4, v10 // padding 16 per block 1024 +s_nop 0 // 1 wait states required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrA], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait states +s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping + +/* local write addresses: first offset b */ +v_mul_u32_u24 v10, 0x40, v6 // lwBL**(DepthU_Compute + PAD) +v_add_lshl_u32 v10, v9, v10, 0x1 // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS +v_lshrrev_b32 v12, 10, v10 // padding 16 per block 1024 +v_lshl_add_u32 v10, v12, 4, v10 // padding 16 per block 1024 +v_add_co_u32 v10, vcc, 0x8200, v10 // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=33280 +s_nop 0 // 1 wait states required before reading vgpr by lane +v_readfirstlane_b32 s[sgprLocalWriteAddrB], v10 // Copy lds write address VGPR to SGPR +s_nop 0 // 1 wait states +s_add_u32 s[sgprSwapB], 
s[sgprLocalWriteAddrB], 66560 // Calculate starting lds addr of second buffer +s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping +v_mov_b32 v12, MT0 // set MT0 into vgpr (v12) +v_mov_b32 v11, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v10, v12 // v10 = float(v12) +v_rcp_iflag_f32 v10, v10 // v10 = 1.0 / v12 +v_cvt_f32_u32 v13, v11 // v13 = float(v11) +v_mul_f32 v10, v10, v13 // v10 = v11 * (1.0 / v12) +v_cvt_u32_f32 v10, v10 // v10 = quotient (float -> u32) +v_mul_u32_u24 v13, v10, v12 // v13 = quotient * v12 +v_sub_u32 v13, v11, v13 // v13 = v11 - quotient * v12 (remainder) +v_cmp_ne_u32 vcc, v13, 0 // vcc = (remainder != 0) +v_addc_co_u32 v10, vcc, v10, 0, vcc // v10 = quotient + vcc -> ceil(v11 / v12) +v_mov_b32 v12, MT1 // set MT1 into vgpr (v12) +v_mov_b32 v11, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v10 // set back to numWorkGroup0 +v_cvt_f32_u32 v10, v12 // v10 = float(v12) +v_rcp_iflag_f32 v10, v10 // v10 = 1.0 / v12 +v_cvt_f32_u32 v13, v11 // v13 = float(v11) +v_mul_f32 v10, v10, v13 // v10 = v11 * (1.0 / v12) +v_cvt_u32_f32 v10, v10 // v10 = quotient (float -> u32) +v_mul_u32_u24 v13, v10, v12 // v13 = quotient * v12 +v_sub_u32 v13, v11, v13 // v13 = v11 - quotient * v12 (remainder) +v_cmp_ne_u32 vcc, v13, 0 // vcc = (remainder != 0) +v_addc_co_u32 v10, vcc, v10, 0, vcc // v10 = quotient + vcc -> ceil(v11 / v12) +s_nop 0 // 1 wait state +v_readfirstlane_b32 s[sgprNumWorkGroups1], v10 // set back to numWorkGroup1 +s_waitcnt lgkmcnt(0) // wait for 44/0 bytes of kern args + +/* Early stop if N(SizeFreeJ) == 0 */ +s_cmp_eq_u32 s[sgprSizeJ], 0 // SizeJ == 0 ? +s_cbranch_scc0 label_NoEarlyStop_N0 // continue if SizeJ != 0 +label_EarlyStop_if_N_is_0: +s_endpgm +label_NoEarlyStop_N0: + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s52, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // s52 = numWG0 * numWG1 +s_and_b32 s53, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s52, s52, s53 // s52 = numWG0 * numWG1 * GSU +v_cvt_f32_u32 v10, s52 // s52 = s[sgprWorkGroup0] / s52
+v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s52 +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s52 +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s52 +v_mul_u32_u24 v11, v10, s52 // s52 = s[sgprWorkGroup0] / s52 +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s52 +v_cmpx_eq_u32 exec, v11, s52 // s52 = s[sgprWorkGroup0] / s52 +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s52 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s52 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup2], s52 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s52, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s52, s52, s[sgprWorkGroup2] +s_mul_i32 s52, s52, s53 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v10, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v10, v10, v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v10, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v11, s[sgprWorkGroup0], v11 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups0] // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v10, 1, v10 // s52 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec 
+v_readfirstlane_b32 s52, v10 // quotient +s_mov_b32 s[sgprWorkGroup1], s52 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s52, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52 + +/* Early stop if wg exceed */ +s_cmp_ge_u32 s[sgprWorkGroup2], s[sgprSizesFree+2] +s_cbranch_scc0 label_NoEarlyStop_wgExceed +label_EarlyStop_if_wg_exceed: +s_endpgm +label_NoEarlyStop_wgExceed: + +label_MultiGemmEnd: +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 62 +.set sgprStaggerUIter, 51 +.set sgprWrapUA, 64 +.set sgprWrapUB, 66 +.set sgprGlobalReadIncsA, 68 +.set sgprGlobalReadIncsB, 69 +.set sgprScalarGlobalReadOffsetA, 70 +.set sgprScalarGlobalReadOffsetB, 77 +s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift +s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift +s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift +s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift + +/* Short circuit condition if Alpha == 0, then sumDims=0 */ +v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? +s_cbranch_vccz label_AlphaNonZero // branch if s[Alpha] != 0 +s_mov_b32 s[sgprSizesSum+0], 0 // Set summation dim=0 if Alpha == 0 +label_AlphaNonZero: + +/******************************************/ +/* Begin setupNewTile */ +/******************************************/ + +/* global read addresses: work-group */ +/* graWorkGroup mapping */ +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU // branch if GSU == 1 +// GSU-not-WGMapRR :nwg1 = (size1J + MT1J - 1) / MT1J; +s_and_b32 s84, s[sgprGSU], 0x4000 // SCC = (GSUWGMRR == 1) ? 
+s_cbranch_scc1 label_GSUWGMRR // branch if GSUWGMRR == 1 +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s84 +v_mov_b32 v11, 0 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup1], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v10 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx], v11 // remainder +s_branch label_GSUWGMRR_End +label_GSUWGMRR: +v_cvt_f32_u32 v10, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_rcp_iflag_f32 v10, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_f32 v10, v10, v11 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_u32_f32 v10, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_sub_u32 v11, s[sgprWorkGroup1], v11 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] 
+v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_add_u32 v10, 1, v10 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mov_b32 v11, 0 // s[sgprWorkGroup1] = s[sgprWorkGroup1] % s[sgprNumWorkGroups1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups1] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1] // re-calculate remainder +v_sub_u32 v11, s[sgprWorkGroup1], v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprGSUSumIdx], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v11 // remainder +label_GSUWGMRR_End: +s_mov_b32 s[sgprGSULog2BpeC], 1 +s_mov_b32 s[sgprGSULog2BpeD], 2 +s_branch label_GSU_End +label_GSU: +s_mov_b64 s[sgprGSUSumIdx:sgprGSUSumIdx+1], 0 // Set GSUSumIdx to 0 +s_mov_b32 s[sgprGSULog2BpeC], 1 +s_mov_b32 s[sgprGSULog2BpeD], 1 +label_GSU_End: +s_sext_i32_i16 s[sgprWGM], s[sgprWGM] // Restore WGM +s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? +s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 +s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? 
+s_cbranch_scc1 label_WGM // branch if WGM >= 0 +s_abs_i32 s[sgprWGM], s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprWorkGroup0] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprWorkGroup0], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s86, v10 // quotient +s_mul_i32 s87, s86, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s87, s[sgprWorkGroup0], s87 // WorkGroup0=remainder +s_mul_i32 s87, s87, s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 +s_add_u32 s87, s87, s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprNumWorkGroups0] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprNumWorkGroups0], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s84, v10 // quotient +s_mul_i32 s85, s[sgprWGM], s84 // quotient * non-magic divisor +s_sub_u32 s85, s[sgprNumWorkGroups0], s85 // NumWorkGroups0=remainder +s_cmp_eq_u32 s85, 0 // remainder == 0 ? +s_cmov_b32 s85, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s86, s84 // blockId >= numFullBlocks ? 
+s_cselect_b32 s84, s85, s[sgprWGM] +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup1] = s87 / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup1] = s87 / s84 +v_cvt_f32_u32 v11, s87 // s[sgprWorkGroup1] = s87 / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup1] = s87 / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup1] = s87 / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup1] = s87 / s84 +v_sub_u32 v11, s87, v11 // s[sgprWorkGroup1] = s87 / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup1] = s87 / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup1] = s87 / s84 +v_mov_b32 v11, 0 // s[sgprWorkGroup0] = s87 % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s87, v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v11 // remainder +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s84 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s87, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s86, s86, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s86 // wg1 += blockId * WGM +s_branch label_WGM +label_WGMPositive: +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprWorkGroup1] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprWorkGroup1], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s86, v10 // quotient +s_mul_i32 s87, s86, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s87, s[sgprWorkGroup1], s87 // 
WorkGroup1=remainder +s_mul_i32 s87, s87, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s87, s87, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v10, s[sgprWGM] // WGM +v_rcp_iflag_f32 v10, v10 // WGM +v_cvt_f32_u32 v11, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v10, v10, v11 // WGM +v_cvt_u32_f32 v10, v10 // WGM +v_mul_u32_u24 v11, v10, s[sgprWGM] // WGM +v_sub_u32 v11, s[sgprNumWorkGroups1], v11 // WGM +v_cmpx_eq_u32 exec, v11, s[sgprWGM] // WGM +v_add_u32 v10, 1, v10 // WGM +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s[sgprWGM] // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s84, v10 // quotient +s_mul_i32 s85, s[sgprWGM], s84 // quotient * non-magic divisor +s_sub_u32 s85, s[sgprNumWorkGroups1], s85 // NumWorkGroups1=remainder +s_cmp_eq_u32 s85, 0 // remainder == 0 ? +s_cmov_b32 s85, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s86, s84 // blockId >= numFullBlocks ? 
+s_cselect_b32 s84, s85, s[sgprWGM] +v_cvt_f32_u32 v10, s84 // s[sgprWorkGroup0] = s87 / s84 +v_rcp_iflag_f32 v10, v10 // s[sgprWorkGroup0] = s87 / s84 +v_cvt_f32_u32 v11, s87 // s[sgprWorkGroup0] = s87 / s84 +v_mul_f32 v10, v10, v11 // s[sgprWorkGroup0] = s87 / s84 +v_cvt_u32_f32 v10, v10 // s[sgprWorkGroup0] = s87 / s84 +v_mul_u32_u24 v11, v10, s84 // s[sgprWorkGroup0] = s87 / s84 +v_sub_u32 v11, s87, v11 // s[sgprWorkGroup0] = s87 / s84 +v_cmpx_eq_u32 exec, v11, s84 // s[sgprWorkGroup0] = s87 / s84 +v_add_u32 v10, 1, v10 // s[sgprWorkGroup0] = s87 / s84 +v_mov_b32 v11, 0 // s[sgprWorkGroup1] = s87 % s84 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v11, s84 // overflow happened in remainder +v_sub_u32 v10, v10, 1 // quotient - 1 +v_mul_u32_u24 v11, v10, s84 // re-calculate remainder +v_sub_u32 v11, s87, v11 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup0], v10 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v11 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s84 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s87, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s86, s86, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s86 // wg1 += blockId * WGM +label_WGM: + +/* global read addresses: tile offset assignment a */ +/* graTileAssignmentA = v4 */ + +/* global read addresses: tile offset assignment b */ +/* graTileAssignmentB = v6 */ + +/* global read addresses: unroll assignment a */ +/* v5 */ + +/* global read addresses: unroll assignment b */ +/* v7 */ + +/* global read addresses: other free assignments */ +/* s[sgprWorkGroup2] */ + +/* global read addresses: tile offsets a */ + +/* global read addresses: tile offsets b */ + +/* global read addresses: unroll offsets a */ + +/* global read addresses: unroll offsets b */ + +/* global read addresses: final offsets a */ +GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0, 5, 4, 10 // gROA_0_0_0_0 
+s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 32 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+0], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 64 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+1], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 96 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+2], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+3], s[sgprStrideA0I], 128 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+3], s[sgprScalarGlobalReadOffsetA+3], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+4], s[sgprStrideA0I], 160 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+4], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+5], s[sgprStrideA0I], 192 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+5], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetA+6], s[sgprStrideA0I], 224 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+6], 0x1 // scalar offset *= bytes/element + +/* global read addresses: final offsets b */ +GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0, 7, 6, 10 // gROB_0_0_0_0 +s_mul_i32 s[sgprScalarGlobalReadOffsetB+0], s[sgprStrideB1J], 32 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+0], s[sgprScalarGlobalReadOffsetB+0], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+1], s[sgprStrideB1J], 64 // compute offset diff 
(scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+1], s[sgprScalarGlobalReadOffsetB+1], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+2], s[sgprStrideB1J], 96 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+2], s[sgprScalarGlobalReadOffsetB+2], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+3], s[sgprStrideB1J], 128 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+3], s[sgprScalarGlobalReadOffsetB+3], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+4], s[sgprStrideB1J], 160 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+4], s[sgprScalarGlobalReadOffsetB+4], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+5], s[sgprScalarGlobalReadOffsetB+5], 0x1 // scalar offset *= bytes/element +s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) +s_lshl_b32 s[sgprScalarGlobalReadOffsetB+6], s[sgprScalarGlobalReadOffsetB+6], 0x1 // scalar offset *= bytes/element + +/* global read addresses: addresses a */ +/* max read offset = size[n] * stride[n-1] */ +s_mul_hi_u32 s87, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s86, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s87, s86, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s86, s86, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_A // branch if GSUC == 1 +s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s84, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_A_End +label_GSUC_A: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s84, 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s85, s85, s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder 
+s_mul_i32 s84, s84, s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s84, s84, s85 // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s85, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s84, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_A_End: +s_add_u32 s86, s86, s84 // accum GsuOffset term to tilestart +s_addc_u32 s87, s87, s85 // accum GsuOffset term to tilestart +s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size +s_sub_u32 s84, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s85, constStrideAL, s84 // stride x (size-1) +s_mul_i32 s84, constStrideAL, s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size +s_sub_u32 s84, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s85, s[sgprStrideA0I], s84 // stride x (size-1) +s_mul_i32 s84, s[sgprStrideA0I], s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s86 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s87 // sub tileStart +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x1 // Set limit to use bytes +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s85, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s84, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s86, s86, s84 // accum wg term to tilestart +s_addc_u32 s87, s87, s85 // accum wg term to tilestart +s_lshl_b64 s[86:87], s[86:87], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s86 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s87 // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +/* global read addresses: addresses b */ +/* max read offset = size[n] * stride[n-1] */ +s_mul_hi_u32 s87, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s86, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s87, s86, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s86, s86, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_B // branch if GSUC == 1 +s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s84, 64, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_B_End +label_GSUC_B: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s84, 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s85, s85, s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder 
+s_mul_i32 s84, s84, s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s84, s84, s85 // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s85, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s84, s84, 64 // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_B_End: +s_add_u32 s86, s86, s84 // accum GsuOffset term to tilestart +s_addc_u32 s87, s87, s85 // accum GsuOffset term to tilestart +s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size +s_sub_u32 s84, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s85, constStrideBL, s84 // stride x (size-1) +s_mul_i32 s84, constStrideBL, s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size +s_sub_u32 s84, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s85, s[sgprStrideB1J], s84 // stride x (size-1) +s_mul_i32 s84, s[sgprStrideB1J], s84 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s86 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s87 // sub tileStart +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x1 // Set limit to use bytes +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s85, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s84, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s86, s86, s84 // accum wg term to tilestart +s_addc_u32 s87, s87, s85 // accum wg term to tilestart +s_lshl_b64 s[86:87], s[86:87], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s86 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s87 // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +/* global read addresses: increments a */ +s_and_b32 s85, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s85, s85, DepthU*BpeAGR // GSU*DepthU*Bpe +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR, s85 // incrA (unrollIdx) + +/* global read addresses: increments b */ +s_and_b32 s85, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s85, s85, DepthU*BpeBGR // GSU*DepthU*Bpe +s_and_b32 s84, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR, s85 // incrB (unrollIdx) +/* declare loop num iterations */ +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 6 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 64 +s_and_b32 s84, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s84, 1 // GSU == 1 ? 
+s_cbranch_scc1 label_GSU_1 // branch if GSU == 1 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v5, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v4, v4, v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v4, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v5, s[sgprLoopCounterL], v5 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v4, 1, v4 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v5, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1] // re-calculate remainder +v_sub_u32 v5, s[sgprLoopCounterL], v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprLoopCounterL], v4 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5 // remainder +s_add_u32 s84, 1, s[sgprLoopCounterL] // tmp<-numIterMyWg+1 +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cmov_b32 s[sgprLoopCounterL], s84 // numIterMyWg++ if needed +label_GSU_1: +s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter +s_and_b32 s86, s[sgprStaggerU], 0x1f00 +s_lshr_b32 s86, s86, 0x8 +s_and_b32 s87, s[sgprStaggerU], 0xe000 +s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff +s_mov_b32 s84, 
s[sgprStaggerU] // init staggerU +label_beginStaggerUIter: +s_lshl_b32 s85, s84, s86 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s85 // loopCount >= current shift Count +s_cbranch_scc1 label_endStaggerUIter // jump to end +s_lshr_b32 s84, s84, 1 // step down to smaller stagger +s_branch label_beginStaggerUIter // jump to begin +label_endStaggerUIter: +s_sub_u32 s85, s84, 1 // staggerU mask +s_cmp_ge_u32 s84, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s85, 0 // set Mask +s_cmp_eq_u32 s87, 0x0 +s_cbranch_scc1 label_StaggerUMapping_1 +s_mov_b32 s84, s[sgprWorkGroup0] +s_branch label_staggerInputEnd +label_StaggerUMapping_1: +s_cmp_eq_u32 s87, 0x2000 +s_cbranch_scc1 label_StaggerUMapping_2 +s_mov_b32 s84, s[sgprWorkGroup1] +s_branch label_staggerInputEnd +label_StaggerUMapping_2: +s_cmp_eq_u32 s87, 0x4000 +s_cbranch_scc1 label_StaggerUMapping_3 +s_mov_b32 s84, -0x1 +s_branch label_staggerInputEnd +label_StaggerUMapping_3: +s_cmp_eq_u32 s87, 0x6000 +s_cbranch_scc1 label_StaggerUMapping_4 +s_mul_i32 s85, s[sgprNumWorkGroups0], s[sgprWorkGroup1] +s_add_u32 s84, s84, s85 +s_add_u32 s84, s84, s[sgprWorkGroup0] +s_branch label_staggerInputEnd +label_StaggerUMapping_4: +s_cmp_eq_u32 s87, 0x8000 +s_cbranch_scc1 label_staggerInputEnd +s_mov_b32 s84, -0x1 +s_branch label_staggerInputEnd +label_staggerInputEnd: +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s84 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s86 // shift by StaggerUStride + +/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ +s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop +s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes 
accessed by the unroll loop +s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration +s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ +s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop +s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop +s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration +s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap +/* local read addresses: init pointers a */ + +/* localReadInitPointers */ +/* local read addresses: init pointers b */ + +/* localReadInitPointers */ + +/* prefetch: global -> local */ +s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? +s_cbranch_scc1 label_ShadowInitStart // skip to ShadowInitStart iter b/c numIter==0 + + +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> Reg 
0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> Reg 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> Reg 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +// Interleave Init C +v_accvgpr_write acc0, 0 // initC +v_accvgpr_write acc1, 0 // initC +v_accvgpr_write acc2, 0 // initC +v_accvgpr_write acc3, 0 // initC +v_accvgpr_write acc4, 0 // initC +v_accvgpr_write acc5, 0 // initC +v_accvgpr_write acc6, 0 // initC +v_accvgpr_write acc7, 0 // initC +v_accvgpr_write acc8, 0 // initC +v_accvgpr_write acc9, 0 // initC +v_accvgpr_write acc10, 0 // initC +v_accvgpr_write acc11, 0 // initC +v_accvgpr_write acc12, 0 // initC +v_accvgpr_write acc13, 0 // initC +v_accvgpr_write acc14, 0 // initC +v_accvgpr_write acc15, 0 // initC + +v_mov_b64 v[6:7], 0 +v_mov_b64 v[8:9], 0 + +v_mfma_f32_32x32x16_bf16 acc[16:31], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[32:47], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[48:63], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[64:79], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[80:95], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[96:111], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[112:127], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[128:143], v[6:9], 
v[6:9], acc[0:15] + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> Reg 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +v_mfma_f32_32x32x16_bf16 acc[144:159], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[160:175], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[176:191], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[192:207], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[208:223], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[224:239], v[6:9], v[6:9], acc[0:15] +v_mfma_f32_32x32x16_bf16 acc[240:255], v[6:9], v[6:9], acc[0:15] + +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +/* global read inc A loopL */ +s_add_u32 s86, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s86 // Is this wrapIter? (pf) +s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUA+1], 0 // incUpper <- ? 
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +/* global read inc B loopL */ +s_add_u32 s86, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s86 // Is this wrapIter? (pf) +s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + +/******************************************/ +/* End setupNewTile */ +/******************************************/ +label_ShadowInitStart: +s_mov_b64 s[sgprSrdD+0:sgprSrdD+0+1], s[sgprAddressD+0:sgprAddressD+0+1] // init SRD base address (dwords 0-1 are rewritten below as AddressD plus the wg1 tile offset) +s_mov_b32 s[sgprSrdD+2], BufferOOB +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +s_mov_b64 s[sgprSrdC+0:sgprSrdC+0+1], s[sgprAddressC+0:sgprAddressC+0+1] // init SRD base address (dwords 0-1 are rewritten below as AddressC plus the wg1 tile offset) +s_mov_b32 s[sgprSrdC+2], BufferOOB +s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD + + +s_mul_i32 s86, MT1, s[sgprWorkGroup1] // s86 <- wg1*MT1 +s_mul_hi_u32 s85, s86, s[sgprStrideC1J] // ScaleC: s85 <- hi32(s86 * StrideC1J) +s_mul_i32 s84, s86, s[sgprStrideC1J] // ScaleC: s84 <- lo32(s86 * StrideC1J) +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC] // scale by bpe (64-bit shift left by GSULog2BpeC) +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s84 // SrdC lo <- AddressC lo plus offset lo +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s85 // SrdC hi <- AddressC hi plus offset hi plus carry +s_mul_hi_u32 s85, s86, s[sgprStrideD1J] // ScaleD: s85 <- hi32(s86 * StrideD1J) +s_mul_i32 s84, s86, s[sgprStrideD1J] // ScaleD: s84 <- lo32(s86 * StrideD1J) +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD] // scale by bpe (64-bit shift left by GSULog2BpeD) +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s84 // SrdD lo <- AddressD lo plus offset lo +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s85 // SrdD hi <- AddressD hi plus offset hi plus carry + +s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC: s85 <- hi32(WorkGroup2 * StrideCK) +s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC: s84 <- lo32(WorkGroup2 * StrideCK) +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC] // scale by bpe (64-bit shift left by GSULog2BpeC) +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s85 // add hi to SRD +s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD: s85 <- hi32(WorkGroup2 * StrideDK) +s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD: s84 <- lo32(WorkGroup2 * StrideDK) +s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD] // scale by
bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85 // add hi to SRD + +s_and_b32 s84, s[sgprGSU], 0x3fff // s84 <- sgprGSU masked to its low 14 bits +s_cmp_eq_u32 s84, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU_2 // GSU == 1: skip the GSU output offset below +// GSU Output Buffer offset: (Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK) * GSUSumIdx * 4 bytes +s_mul_hi_u32 s85, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0: s85 <- hi32(SizesFree0 * GSUSumIdx) +s_mul_i32 s84, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0: s84 <- lo32(SizesFree0 * GSUSumIdx) +s_sub_u32 s86, s[sgprSizesFree+1], 1 // Free1: s86 <- SizesFree1 - 1 +s_mul_i32 s86, s86, s[sgprGSUSumIdx] // Free1: s86 <- s86 * GSUSumIdx (low 32 bits only) +s_mul_hi_u32 s87, s86, s[sgprStrideC1J] // Free1: s87 <- hi32(s86 * StrideC1J), taken before s86 is overwritten +s_mul_i32 s86, s86, s[sgprStrideC1J] // Free1: s86 <- lo32(s86 * StrideC1J) +s_add_u32 s84, s84, s86 // Free1: accumulate lo +s_addc_u32 s85, s85, s87 // Free1: accumulate hi with carry +s_sub_u32 s86, s[sgprSizesFree+2], 1 // Free2: s86 <- SizesFree2 - 1 +s_mul_i32 s86, s86, s[sgprGSUSumIdx] // Free2: s86 <- s86 * GSUSumIdx (low 32 bits only) +s_mul_hi_u32 s87, s86, s[sgprStrideCK] // Free2: s87 <- hi32(s86 * StrideCK), taken before s86 is overwritten +s_mul_i32 s86, s86, s[sgprStrideCK] // Free2: s86 <- lo32(s86 * StrideCK) +s_add_u32 s84, s84, s86 // Free2: accumulate lo +s_addc_u32 s85, s85, s87 // Free2: accumulate hi with carry +s_lshl_b64 s[84:85], s[84:85], 2 // scale by 4 bytes per element (shift left by 2) +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84 // add lo GSU offset to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85 // add hi GSU offset to SRD +label_GSU_2: +.set sgprGSULog2BpeC, UNDEF +.set sgprAddressC, UNDEF + + +s_cmp_eq_u32 s[sgprLoopCounterL], 0 // LoopCounterL == 0 ? (numIter == 0)
+ +/* after InitC, skip to end of prefetch last iter if numIter==0 */ +s_cbranch_scc0 label_NoBranch_T8JHFHKM7BO5OHXW // scc0 (LoopCounterL != 0): skip the long jump; only jump on scc1 +s_getpc_b64 s[84:85] // addr of next instr +s_add_i32 s86, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s84, s84, s86 // add target branch offset +s_addc_u32 s85, s85, 0 // add high and carry +s_setpc_b64 s[84:85] // branch to label_PrefetchGlobalLastIterEnd +label_NoBranch_T8JHFHKM7BO5OHXW: + + +/* local write a */ + +/* local write b */ + +/* local write swap a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // LocalWriteAddrA ^= SwapA (swap Red Blk SGPR) + +/* local write swap b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // LocalWriteAddrB ^= SwapB (swap Red Blk SGPR) +s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL == 1 ? (PGR=2 but only 1 loop) +s_cbranch_scc1 label_skipPGR2 // only 1 loop: skip the second prefetch and the swap back + +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0],
s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_7_0 + +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_0_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_1_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_2_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_3_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_4_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_5_0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_6_0 +s_add_u32 m0, m0, 4160 // Move LDS write
address to next line +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> LDS (lds modifier, LDS address from m0) 0_0_7_0 + + +/* local write swap a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // LocalWriteAddrA ^= SwapA (swap Red Blk SGPR) + +/* local write swap b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // LocalWriteAddrB ^= SwapB (swap Red Blk SGPR) +label_skipPGR2: + +s_waitcnt vmcnt(24) // NOTE(review): with two prefetches in flight (presumably 8 A and 8 B loads each, 32 total) this retires the first-prefetch A loads; when label_skipPGR2 is reached by branch only the first prefetch is outstanding and this wait does not block - confirm the 1-loop path waits before consuming this LDS data +s_barrier + +/* local read prefetch a */ +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +s_waitcnt vmcnt(16) // NOTE(review): same caveat as the vmcnt(24) wait above; on the two-prefetch path this presumably retires the first-prefetch B loads +s_barrier + +/* local read prefetch b */ +ds_read_b128
v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +s_waitcnt lgkmcnt(0) // wait for all local read prefetches above to complete + + +/******************************************/ +/* Unrolled Loop(s) - Begin */ +/******************************************/ +label_openLoopL: +s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL == 1 ? +s_cbranch_scc1 label_toPGR1 // PGR=2 but only 1 loop, toPGR1 +s_cmp_le_u32 s[sgprLoopCounterL], 0x2 // LoopCounterL <= 2 ? +s_cbranch_scc1 label_LoopEndL // do not enter LoopL + + +// MAIN LOOP MACRO - Shared code between Even/Odd simds +.macro MAINLOOP isOdd + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3],
v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* global read inc A loopL */ +s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? +s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +s_cselect_b32 s85, s[sgprWrapUA+1], 0 // incUpper <- ? 
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84 // gra SRD += inc(lower) +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc) +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +/* global read inc B loopL */ +s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s85, s[sgprWrapUB+1], 0 // incUpper <- ? 
+/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85 // gra SRD += inc(upper) +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc) +/* mfmaIndex:16 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+/* mfmaIndex:17 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +/* mfmaIndex:18 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:19 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +s_waitcnt lgkmcnt(0) // wait for A local reads +/* mfmaIndex:21 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +s_barrier + +.if \isOdd == 0 +////////////////////////////////////////////////////////////////////// EVEN WAVES +/* mfmaIndex:22 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 +/* mfmaIndex:23 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:24 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], 
v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:25 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 , lds // G -> Reg 0_0_1_0 +/* mfmaIndex:26 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:27 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:28 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 , lds // G -> Reg 0_0_2_0 +/* mfmaIndex:29 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:30 */ +v_mfma_f32_16x16x32_f16 acc[120:123], 
v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:31 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 , lds // G -> Reg 0_0_3_0 +/* mfmaIndex:32 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:33 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:34 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 , lds // G -> Reg 0_0_4_0 +/* mfmaIndex:35 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:36 */ 
+v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +.else +////////////////////////////////////////////////////////////////////// ODD WAVES +/* mfmaIndex:22 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:23 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 +/* mfmaIndex:24 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:25 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:26 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = 
acc[104+0:107+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 , lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:27 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:28 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:29 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 , lds // G -> Reg 0_0_2_0 + +/* mfmaIndex:30 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:31 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:32 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], 
v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 , lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:33 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:34 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:35 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 , lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:36 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +.endif ////////////////////////////////////////////////////////////////////// END branch + + +/* mfmaIndex:37 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], 
v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:39 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +/* mfmaIndex:41 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + /* mfmaIndex:43 */ +v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], 
v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] + /* mfmaIndex:47 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +s_waitcnt lgkmcnt(0) +/* mfmaIndex:51 */ +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +s_barrier + +.if \isOdd == 0 +////////////////////////////////////////////////////////////////////// EVEN WAVES +/* mfmaIndex:52 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 , lds // G -> Reg 0_0_5_0 +/* mfmaIndex:53 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], 
v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:54 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 , lds // G -> Reg 0_0_6_0 +/* mfmaIndex:56 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* mfmaIndex:57 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] + +/* mfmaIndex:58 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 , lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:59 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +/* mfmaIndex:60 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], 
acc[240:243] // left value = acc[240+0:243+0] + +/* mfmaIndex:61 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 + +/* mfmaIndex:62 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:63 */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] + +// Iteration one + +/* mfmaIndex:64 */ +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 , lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* local write swap offsets a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] + +.else +////////////////////////////////////////////////////////////////////// ODD WAVES +/* mfmaIndex:52 */ +v_mfma_f32_16x16x32_f16 acc[208:211], 
v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] + +/* mfmaIndex:53 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 , lds // G -> Reg 0_0_5_0 + +/* mfmaIndex:54 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:55 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] + +/* mfmaIndex:56 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 , lds // G -> Reg 0_0_6_0 + +/* mfmaIndex:57 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:58 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] + + +/* mfmaIndex:59 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], 
acc[236:239] // left value = acc[236+0:239+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 , lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:60 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +s_mov_b32 m0, s[sgprLocalWriteAddrB] // m0 <- LDS write address + +/* mfmaIndex:61 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] + + +/* mfmaIndex:62 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 , lds // G -> Reg 0_0_0_0 + +/* mfmaIndex:63 */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +// Iteration one + +/* mfmaIndex:64 */ +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] + + +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 , lds // G -> Reg 0_0_1_0 + +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], 
acc[8:11] // left value = acc[8+0:11+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line +/* local write swap offsets a */ +s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR + +.endif + + +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], 
v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] + + +.if \isOdd == 0 + +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], 
v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 , lds // G -> Reg 0_0_2_0 + +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 , lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 , lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] + + 
+/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_waitcnt vmcnt(13) // wait for previous set of global reads + +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +s_barrier + +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 
s[sgprScalarGlobalReadOffsetB+4] offen offset:0 , lds // G -> Reg 0_0_5_0 + +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + + +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] + + +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 , lds // G -> Reg 0_0_6_0 + + +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] + +.else + +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] + +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], 
v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 , lds // G -> Reg 0_0_2_0 + + +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 , lds // G -> Reg 0_0_3_0 + +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> Reg 0_0_4_0 + +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] + + +/* 
mfmaIndex:90 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] + + +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +s_waitcnt vmcnt(13) // wait for previous set of global reads + +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +s_barrier + +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> Reg 0_0_5_0 + + +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_f16 acc[128:131], 
v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + + +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 + +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] + +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] + +.endif + + +/* mfmaIndex:102 */ 
+v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +s_add_u32 m0, m0, 4160 // Move LDS write address to next line + +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 + + + +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:107 */ 
+v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] + + + +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] + +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] + + +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] + + + +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] + + + +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] 
// left value = acc[200+0:203+0] + +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] + + +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] + + +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 + + +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] + + +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] + + +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], 
v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] + + +.if \isOdd == 0 + +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] + +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 , lds // G -> Reg 0_0_7_0 + +.else + +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 + +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], 
acc[240:243] // left value = acc[240+0:243+0] + +.endif + + +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* local write swap offsets b */ +s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* loop-exit test: the s_cbranch_scc0 placed after each MAINLOOP expansion branches on the SCC produced by the compare below */ +s_cmp_eq_i32 s[sgprLoopCounterL], 0x2 // counterL==2 +s_waitcnt lgkmcnt(0) + +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +.endm + + + +/* Main-loop dispatch: pick one of the two MAINLOOP expansions for this wave. */ +/* s_getreg_b32 reads a 1-bit field at bit offset 4 of HW_REG_HW_ID into s86; s86==0 falls through to WVLoop0 (MAINLOOP 0), any other value takes the s_cbranch_scc0 to WVLoop1 (MAINLOOP 1). */ +/* NOTE(review): assumes bit 4 of HW_ID is the low bit of the SIMD id on gfx950 - confirm against the ISA guide. */ +/* NOTE(review): s86 is a raw SGPR index rather than a named sgpr symbol - confirm it is not live at this point. */ +// EVEN SIMDID takes WVLoop0 path, ODD SIMDID takes other path +s_getreg_b32 s86, hwreg(HW_REG_HW_ID, 4, 1) +/* presumably disabled because the 1-bit field read above already leaves only 0 or 1 in s86 */ +//s_and_b32 s86, s86, 1 +s_cmp_eq_u32 s86, 0 +s_cbranch_scc0 WVLoop1 + +/******************************************/ +/* Unrolled Loop 1/1 - Begin (Even SIMD) */ +/******************************************/ +WVLoop0: +label_LoopBeginL0: +MAINLOOP 0 +/* closeLoop loopL finalLoop=1 tailLoop=0 */ +/* SCC==0 here means the counterL==2 compare near the end of MAINLOOP did not match, so run another iteration */ +s_cbranch_scc0 label_LoopBeginL0 // restart LoopL +/* even path finished: jump over the odd-SIMD copy of the loop */ +s_branch label_LoopEndL + +/******************************************/ +/* Unrolled Loop 1/1 - Begin (Odd SIMD) */ +/******************************************/ +WVLoop1: +label_LoopBeginL1: +MAINLOOP 1 +/* closeLoop loopL finalLoop=1 tailLoop=0 */ +/* same exit test as the even loop; when it is not taken, execution falls through to label_LoopEndL */ +s_cbranch_scc0 label_LoopBeginL1 // restart LoopL + +label_LoopEndL: + +/* Before NLL: Check VGPR.checkin for INT8 LW */ + +/******************************************/ +/* Ord. 
NoGlobalLoadLoop - Begin */ +/******************************************/ + +/* iter 0 (reset local read pointers iteration) (swap local read pointers iteration) */ +/* grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] + + +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] + + +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] + +/* 
mfmaIndex:6 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] + +/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + + +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] + +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] + +/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], 
v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] + +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] + +/* mfmaIndex:16 */ +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:19 
*/ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 
rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + + + /* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 
swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + + /* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], 
v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], 
v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], 
v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], 
v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ + +/* local read swap offsets a */ +v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk + +/* local read swap offsets b */ +v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk + +/* local read init pointers a */ + +/* localReadInitPointers */ + +/* local read init pointers b */ + +/* localReadInitPointers */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ + +/* iter 1 (swap and reset local write pointers iteration) */ +/* grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ 
+v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* 
mfmaIndex:78 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] 
+/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], 
v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ + +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +s_waitcnt vmcnt(0) // wait for global reads with lds + +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], 
acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +s_barrier +v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 
+v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[212:215], 
v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], 
v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=1 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ +label_toPGR1: +s_and_b32 s8, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s8, 1 // GSU == 1 ? +s_cbranch_scc0 label_GSU_3 // branch if GSU != 1 + +/******************************************/ +/* Opt. NoLoadLoop - Begin */ +/******************************************/ +s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 +s_cbranch_scc0 label_OptNLL_End // Branch if Beta is not zero + +s_cmp_eq_u32 s[sgprAlpha], 1.0 // Alpha == 1.0 ? 
+s_cbranch_scc0 label_OptNLL_End // branch if alpha != 1 + +s_and_b32 s84, 255, s[sgprSizeI] // s84 = s[sgprSizeI] % 256 +s_add_u32 s85, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s85 // wg0 >= nwg0-1 ? +s_cselect_b32 s84, s84, 0 // set rMT0 +s_cmpk_gt_u32 s84, 0 // rMT0 > 0 +s_cbranch_scc1 label_OptNLL_End // jump if edges required +s_and_b32 s84, 255, s[sgprSizeJ] // s84 = s[sgprSizeJ] % 256 +s_add_u32 s85, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s85 // wg1 >= nwg1-1 +s_cselect_b32 s84, s84, 0 // set rMT1 +s_cmpk_gt_u32 s84, 0 // rMT1 > 0 +s_cbranch_scc1 label_OptNLL_End // jump if edges required + + + +/* mfmaIndex:0 */ +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:1 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] + +/* mfmaIndex:2 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:3 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] + +/* mfmaIndex:4 */ +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +ds_read_b128 
v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:5 */ +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] + +/* mfmaIndex:6 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:7 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] + +/* mfmaIndex:8 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:9 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] + +/* mfmaIndex:10 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:11 */ +v_mfma_f32_16x16x32_f16 acc[44:47], 
v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] + +/* mfmaIndex:12 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:13 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] + +/* mfmaIndex:14 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:15 */ +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] + +/* mfmaIndex:16 */ +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* schedule remaining 
localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], 
acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 + +/* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* 
localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ + +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] + +/* iter 1 (last unrolled loop) */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_f16 
acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ 
+v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* 
mfmaIndex:89 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], 
v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ 
+v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], 
v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 
*/ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ +label_toPGR1end_OptNLL: +/* Stores for OptNLL */ +label_Summation_End_OptNLL: +/* endSummation: add vgpr [0...132) to pool */ +/* load store sgprs */ + +/* Mapping of Acc register -> C Vgpr register */ +/* computeStoreVgprs */ +v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 +v_lshrrev_b32 v5, 1, v4 // 5 = 4 / 2 +v_mul_lo_u32 v5, 0x10, v5 // wave coordination offset 1 +v_and_b32 v1, 63, v[vgprSerial] // v1 = v[vgprSerial] % 64 +v_lshrrev_b32 v1, 4, v1 // 1 = 1 / 16 +v_lshlrev_b32 v1, 2, v1 // thread0 * continuous_output +v_add_lshl_u32 v1, v5, v1, 3 // coordination 1 = vwB *(wave_id1 + tid1) +v_mul_lo_u32 v2, v1, s[sgprStrideC1J] // offset 1 +v_mul_lo_u32 v3, v1, s[sgprStrideD1J] // offset 1 +v_and_b32 v0, 1, v4 // v0 = v4 % 2 +v_mul_lo_u32 v0, 0x10, v0 // wave coordination offset 0 +v_and_b32 v5, 15, v[vgprSerial] // v5 = v[vgprSerial] % 16 +v_add_lshl_u32 v0, v5, v0, 3 // coordination 0 = vwA * (wave_id0 + tid0) +s_mul_i32 s8, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v0, s8, v0 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s8, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v1, s8, v1 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 + +/******************************************/ +/* Global Write Elements */ +/******************************************/ +label_GW_B0_E0: + +/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_lshl_u32 v7, v3, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=0, coord0Vgpr=0 +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+216], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+217], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+218], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+219], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+220], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+221], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+222], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+223], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+224], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+225], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+226], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+227], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+228], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+229], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+230], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+231], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+232], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+233], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+234], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+235], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+236], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+237], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+238], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+239], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+240], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+241], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+242], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+243], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+244], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+245], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+246], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+247], acc127 // copy acc to vreg[223] + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to 
fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 
+v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt 
// store D +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, 
v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor 
+v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to 
fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 +v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +s_lshl_b32 
s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, v[vgprValuC+112], v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], 
v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+136], v[vgprValuC+136] // 
convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+137], v[vgprValuC+137] // convert C to fp16 +v_pack_b32_f16 v136, v[vgprValuC+136], v[vgprValuC+137] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+138], v[vgprValuC+138] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+139], v[vgprValuC+139] // convert C to fp16 +v_pack_b32_f16 v137, v[vgprValuC+138], v[vgprValuC+139] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+140], v[vgprValuC+140] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+141], v[vgprValuC+141] // convert C to fp16 +v_pack_b32_f16 v138, v[vgprValuC+140], v[vgprValuC+141] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+142], v[vgprValuC+142] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+143], v[vgprValuC+143] // convert C to fp16 +v_pack_b32_f16 v139, v[vgprValuC+142], v[vgprValuC+143] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+144], v[vgprValuC+144] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+145], v[vgprValuC+145] // convert C to fp16 +v_pack_b32_f16 v144, v[vgprValuC+144], v[vgprValuC+145] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+146], v[vgprValuC+146] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+147], v[vgprValuC+147] // convert C to fp16 +v_pack_b32_f16 v145, v[vgprValuC+146], v[vgprValuC+147] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+148], v[vgprValuC+148] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+149], v[vgprValuC+149] // convert C to fp16 +v_pack_b32_f16 v146, v[vgprValuC+148], v[vgprValuC+149] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+150], v[vgprValuC+150] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+151], v[vgprValuC+151] // convert C to fp16 +v_pack_b32_f16 v147, 
v[vgprValuC+150], v[vgprValuC+151] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+152], v[vgprValuC+152] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+153], v[vgprValuC+153] // convert C to fp16 +v_pack_b32_f16 v152, v[vgprValuC+152], v[vgprValuC+153] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+154], v[vgprValuC+154] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+155], v[vgprValuC+155] // convert C to fp16 +v_pack_b32_f16 v153, v[vgprValuC+154], v[vgprValuC+155] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+156], v[vgprValuC+156] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+157], v[vgprValuC+157] // convert C to fp16 +v_pack_b32_f16 v154, v[vgprValuC+156], v[vgprValuC+157] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+158], v[vgprValuC+158] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+159], v[vgprValuC+159] // convert C to fp16 +v_pack_b32_f16 v155, v[vgprValuC+158], v[vgprValuC+159] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+160], v[vgprValuC+160] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+161], v[vgprValuC+161] // convert C to fp16 +v_pack_b32_f16 v160, v[vgprValuC+160], v[vgprValuC+161] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+162], v[vgprValuC+162] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+163], v[vgprValuC+163] // convert C to fp16 +v_pack_b32_f16 v161, v[vgprValuC+162], 
v[vgprValuC+163] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+164], v[vgprValuC+164] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+165], v[vgprValuC+165] // convert C to fp16 +v_pack_b32_f16 v162, v[vgprValuC+164], v[vgprValuC+165] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+166], v[vgprValuC+166] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+167], v[vgprValuC+167] // convert C to fp16 +v_pack_b32_f16 v163, v[vgprValuC+166], v[vgprValuC+167] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+168], v[vgprValuC+168] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+169], v[vgprValuC+169] // convert C to fp16 +v_pack_b32_f16 v168, v[vgprValuC+168], v[vgprValuC+169] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+170], v[vgprValuC+170] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+171], v[vgprValuC+171] // convert C to fp16 +v_pack_b32_f16 v169, v[vgprValuC+170], v[vgprValuC+171] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+172], v[vgprValuC+172] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+173], v[vgprValuC+173] // convert C to fp16 +v_pack_b32_f16 v170, v[vgprValuC+172], v[vgprValuC+173] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+174], v[vgprValuC+174] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+175], v[vgprValuC+175] // convert C to fp16 +v_pack_b32_f16 v171, v[vgprValuC+174], v[vgprValuC+175] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v7, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+176], v[vgprValuC+176] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+177], v[vgprValuC+177] // convert C to fp16 +v_pack_b32_f16 v176, v[vgprValuC+176], v[vgprValuC+177] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+178], v[vgprValuC+178] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+179], v[vgprValuC+179] // convert C to fp16 +v_pack_b32_f16 v177, v[vgprValuC+178], v[vgprValuC+179] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+180], v[vgprValuC+180] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+181], v[vgprValuC+181] // convert C to fp16 +v_pack_b32_f16 v178, v[vgprValuC+180], v[vgprValuC+181] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+182], v[vgprValuC+182] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+183], v[vgprValuC+183] // convert C to fp16 +v_pack_b32_f16 v179, v[vgprValuC+182], v[vgprValuC+183] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+184], v[vgprValuC+184] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+185], v[vgprValuC+185] // convert C to fp16 +v_pack_b32_f16 v184, v[vgprValuC+184], v[vgprValuC+185] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+186], v[vgprValuC+186] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+187], v[vgprValuC+187] // convert C to fp16 +v_pack_b32_f16 v185, v[vgprValuC+186], v[vgprValuC+187] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+188], v[vgprValuC+188] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+189], v[vgprValuC+189] // convert C to fp16 +v_pack_b32_f16 v186, v[vgprValuC+188], v[vgprValuC+189] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+190], v[vgprValuC+190] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+191], v[vgprValuC+191] // convert C to fp16 +v_pack_b32_f16 v187, v[vgprValuC+190], v[vgprValuC+191] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+192], v[vgprValuC+192] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+193], v[vgprValuC+193] // convert C to fp16 +v_pack_b32_f16 v192, v[vgprValuC+192], v[vgprValuC+193] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+194], v[vgprValuC+194] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+195], v[vgprValuC+195] // convert C to fp16 +v_pack_b32_f16 v193, v[vgprValuC+194], v[vgprValuC+195] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+196], v[vgprValuC+196] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+197], v[vgprValuC+197] // convert C to fp16 +v_pack_b32_f16 v194, v[vgprValuC+196], v[vgprValuC+197] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+198], v[vgprValuC+198] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+199], v[vgprValuC+199] // convert C to fp16 +v_pack_b32_f16 v195, v[vgprValuC+198], v[vgprValuC+199] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+200], v[vgprValuC+200] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+201], v[vgprValuC+201] // convert C to fp16 +v_pack_b32_f16 v200, v[vgprValuC+200], v[vgprValuC+201] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+202], v[vgprValuC+202] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+203], 
v[vgprValuC+203] // convert C to fp16 +v_pack_b32_f16 v201, v[vgprValuC+202], v[vgprValuC+203] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+204], v[vgprValuC+204] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+205], v[vgprValuC+205] // convert C to fp16 +v_pack_b32_f16 v202, v[vgprValuC+204], v[vgprValuC+205] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+206], v[vgprValuC+206] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+207], v[vgprValuC+207] // convert C to fp16 +v_pack_b32_f16 v203, v[vgprValuC+206], v[vgprValuC+207] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+208], v[vgprValuC+208] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+209], v[vgprValuC+209] // convert C to fp16 +v_pack_b32_f16 v208, v[vgprValuC+208], v[vgprValuC+209] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+210], v[vgprValuC+210] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+211], v[vgprValuC+211] // convert C to fp16 +v_pack_b32_f16 v209, v[vgprValuC+210], v[vgprValuC+211] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+212], v[vgprValuC+212] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+213], v[vgprValuC+213] // convert C to fp16 +v_pack_b32_f16 v210, v[vgprValuC+212], v[vgprValuC+213] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+214], v[vgprValuC+214] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+215], v[vgprValuC+215] // convert C to fp16 +v_pack_b32_f16 v211, v[vgprValuC+214], v[vgprValuC+215] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += 
inc(upper) +buffer_store_dwordx4 v[208:211], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+216], v[vgprValuC+216] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+217], v[vgprValuC+217] // convert C to fp16 +v_pack_b32_f16 v216, v[vgprValuC+216], v[vgprValuC+217] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+218], v[vgprValuC+218] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+219], v[vgprValuC+219] // convert C to fp16 +v_pack_b32_f16 v217, v[vgprValuC+218], v[vgprValuC+219] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+220], v[vgprValuC+220] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+221], v[vgprValuC+221] // convert C to fp16 +v_pack_b32_f16 v218, v[vgprValuC+220], v[vgprValuC+221] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+222], v[vgprValuC+222] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+223], v[vgprValuC+223] // convert C to fp16 +v_pack_b32_f16 v219, v[vgprValuC+222], v[vgprValuC+223] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+224], v[vgprValuC+224] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+225], v[vgprValuC+225] // convert C to fp16 +v_pack_b32_f16 v224, v[vgprValuC+224], v[vgprValuC+225] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+226], v[vgprValuC+226] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+227], v[vgprValuC+227] // convert C to fp16 +v_pack_b32_f16 v225, v[vgprValuC+226], v[vgprValuC+227] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+228], v[vgprValuC+228] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+229], v[vgprValuC+229] // convert C to fp16 +v_pack_b32_f16 v226, v[vgprValuC+228], v[vgprValuC+229] // Pack with neighbor +v_cvt_f16_f32 
v[vgprValuC+230], v[vgprValuC+230] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+231], v[vgprValuC+231] // convert C to fp16 +v_pack_b32_f16 v227, v[vgprValuC+230], v[vgprValuC+231] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+232], v[vgprValuC+232] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+233], v[vgprValuC+233] // convert C to fp16 +v_pack_b32_f16 v232, v[vgprValuC+232], v[vgprValuC+233] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+234], v[vgprValuC+234] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+235], v[vgprValuC+235] // convert C to fp16 +v_pack_b32_f16 v233, v[vgprValuC+234], v[vgprValuC+235] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+236], v[vgprValuC+236] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+237], v[vgprValuC+237] // convert C to fp16 +v_pack_b32_f16 v234, v[vgprValuC+236], v[vgprValuC+237] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+238], v[vgprValuC+238] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+239], v[vgprValuC+239] // convert C to fp16 +v_pack_b32_f16 v235, v[vgprValuC+238], v[vgprValuC+239] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+240], v[vgprValuC+240] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+241], v[vgprValuC+241] // convert C to fp16 +v_pack_b32_f16 v240, v[vgprValuC+240], v[vgprValuC+241] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+242], 
v[vgprValuC+242] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+243], v[vgprValuC+243] // convert C to fp16 +v_pack_b32_f16 v241, v[vgprValuC+242], v[vgprValuC+243] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+244], v[vgprValuC+244] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+245], v[vgprValuC+245] // convert C to fp16 +v_pack_b32_f16 v242, v[vgprValuC+244], v[vgprValuC+245] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+246], v[vgprValuC+246] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+247], v[vgprValuC+247] // convert C to fp16 +v_pack_b32_f16 v243, v[vgprValuC+246], v[vgprValuC+247] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[240:243], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+16], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+17], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+20], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+21], acc151 // copy acc to vreg[229] 
+v_accvgpr_read_b32 v[vgprValuC+22], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+23], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+24], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+25], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+26], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+27], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+28], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+29], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+30], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+31], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+32], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+33], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+34], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+35], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+36], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+37], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+38], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+39], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+40], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+41], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+42], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+43], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+44], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+45], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+46], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+47], acc255 // copy acc to vreg[255] + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 
v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[16:19], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // 
incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], 
v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v7, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End // jump to end (label_GW_End is the very next label, so this lands on the s_endpgm just below) +label_GW_End: + +s_endpgm // Kernel End of this write path; s_endpgm terminates the wave, so the labels after it are reached only by branches +label_OptNLL_End: +label_GSU_3: + +/******************************************/ +/* Ord. NoLoadLoop - Begin */ +/******************************************/ + +/* iter 0 (last unrolled loop): the MFMAs read the ValuA/ValuB X0 registers while the interleaved ds_read_b128s load the X1 registers */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:0 */ +s_waitcnt lgkmcnt(7) // wait for prior local read local write old=0, new=7 newLW=0 newLR=7 for iteration == 0 +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:1 */ +ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:2 */ +ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[8:11], 
v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:3 */ +ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:4 */ +ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:5 */ +ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:6 */ +ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:7 */ +ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left 
value = acc[28+0:31+0] +/* mfmaIndex:8 */ +ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +s_waitcnt lgkmcnt(8) // wait for prior local read local write +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:9 */ +ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:10 */ +ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:11 */ +ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:12 */ +ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:13 */ +ds_read_b128 
v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:14 */ +ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:15 */ +ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:16 */ +ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 +/* localReadsVacancy: latencyLeft 1 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:17 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:18 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* 
mfmaIndex:19 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:20 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:21 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:22 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:23 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:24 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:25 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:26 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:27 */ +/* localReadsVacancy: latencyLeft 5 */ 
+v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:28 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:29 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:30 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] +/* mfmaIndex:31 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:32 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:33 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:34 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:35 */ +/* localReadsVacancy: latencyLeft 5 */ 
+v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:36 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:37 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:38 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:39 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:40 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:41 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:42 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:43 */ +/* localReadsVacancy: latencyLeft 5 */ 
+v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:44 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:45 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:46 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:47 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:48 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:49 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:50 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:51 */ +/* localReadsVacancy: latencyLeft 5 */ 
+v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:52 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:53 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:54 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:55 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:56 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:57 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:58 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:59 */ +/* localReadsVacancy: latencyLeft 5 */ 
+v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:60 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:61 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:62 */ +/* schedule remaining localreads for one buffer scheduling */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:63 */ +/* localReadsVacancy: latencyLeft 5 */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */ +/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ + +/* iter 1 (last unrolled loop): accumulates into the same acc registers as iter 0 (restarting at acc[0:3]) but reads the ValuA/ValuB X1 registers */ +/* grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63 */ +/* numMfmaForLR:20, syncPlrMfmaIndex:107 */ +/* mfmaIndex:64 */ +s_waitcnt lgkmcnt(0) // wait for prior local read local write old=0, new=0 newLW=0 newLR=0 (count 0: every ds_read_b128 issued during iter 0 must have returned before X1 is consumed) +v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] +/* mfmaIndex:65 */ +v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], 
v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] +/* mfmaIndex:66 */ +v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] +/* mfmaIndex:67 */ +v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] +/* mfmaIndex:68 */ +v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] +/* mfmaIndex:69 */ +v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] +/* mfmaIndex:70 */ +v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] +/* mfmaIndex:71 */ +v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] +/* mfmaIndex:72 */ +v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] +/* mfmaIndex:73 */ +v_mfma_f32_16x16x32_f16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0] +/* mfmaIndex:74 */ +v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0] +/* mfmaIndex:75 */ +v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], 
v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] +/* mfmaIndex:76 */ +v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] +/* mfmaIndex:77 */ +v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] +/* mfmaIndex:78 */ +v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] +/* mfmaIndex:79 */ +v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] +/* mfmaIndex:80 */ +v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] +/* mfmaIndex:81 */ +v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] +/* mfmaIndex:82 */ +v_mfma_f32_16x16x32_f16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0] +/* mfmaIndex:83 */ +v_mfma_f32_16x16x32_f16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0] +/* mfmaIndex:84 */ +v_mfma_f32_16x16x32_f16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0] +/* mfmaIndex:85 */ +v_mfma_f32_16x16x32_f16 acc[84:87], 
v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0] +/* mfmaIndex:86 */ +v_mfma_f32_16x16x32_f16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0] +/* mfmaIndex:87 */ +v_mfma_f32_16x16x32_f16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0] +/* mfmaIndex:88 */ +v_mfma_f32_16x16x32_f16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0] +/* mfmaIndex:89 */ +v_mfma_f32_16x16x32_f16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0] +/* mfmaIndex:90 */ +v_mfma_f32_16x16x32_f16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0] +/* mfmaIndex:91 */ +v_mfma_f32_16x16x32_f16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0] +/* mfmaIndex:92 */ +v_mfma_f32_16x16x32_f16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0] +/* mfmaIndex:93 */ +v_mfma_f32_16x16x32_f16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0] +/* mfmaIndex:94 */ +v_mfma_f32_16x16x32_f16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0] 
+/* mfmaIndex:95 */ +v_mfma_f32_16x16x32_f16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0] +/* mfmaIndex:96 */ +v_mfma_f32_16x16x32_f16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0] +/* mfmaIndex:97 */ +v_mfma_f32_16x16x32_f16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0] +/* mfmaIndex:98 */ +v_mfma_f32_16x16x32_f16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0] +/* mfmaIndex:99 */ +v_mfma_f32_16x16x32_f16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0] +/* mfmaIndex:100 */ +v_mfma_f32_16x16x32_f16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0] +/* mfmaIndex:101 */ +v_mfma_f32_16x16x32_f16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0] +/* mfmaIndex:102 */ +v_mfma_f32_16x16x32_f16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0] +/* mfmaIndex:103 */ +v_mfma_f32_16x16x32_f16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0] +/* mfmaIndex:104 */ +v_mfma_f32_16x16x32_f16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], 
v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0] +/* mfmaIndex:105 */ +v_mfma_f32_16x16x32_f16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0] +/* mfmaIndex:106 */ +v_mfma_f32_16x16x32_f16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0] +/* mfmaIndex:107 */ +v_mfma_f32_16x16x32_f16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0] +/* mfmaIndex:108 */ +v_mfma_f32_16x16x32_f16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0] +/* mfmaIndex:109 */ +v_mfma_f32_16x16x32_f16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0] +/* mfmaIndex:110 */ +v_mfma_f32_16x16x32_f16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0] +/* mfmaIndex:111 */ +v_mfma_f32_16x16x32_f16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0] +/* mfmaIndex:112 */ +v_mfma_f32_16x16x32_f16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0] +/* mfmaIndex:113 */ +v_mfma_f32_16x16x32_f16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0] +/* mfmaIndex:114 */ 
+v_mfma_f32_16x16x32_f16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0] +/* mfmaIndex:115 */ +v_mfma_f32_16x16x32_f16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0] +/* mfmaIndex:116 */ +v_mfma_f32_16x16x32_f16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0] +/* mfmaIndex:117 */ +v_mfma_f32_16x16x32_f16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0] +/* mfmaIndex:118 */ +v_mfma_f32_16x16x32_f16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0] +/* mfmaIndex:119 */ +v_mfma_f32_16x16x32_f16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0] +/* mfmaIndex:120 */ +v_mfma_f32_16x16x32_f16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0] +/* mfmaIndex:121 */ +v_mfma_f32_16x16x32_f16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0] +/* mfmaIndex:122 */ +v_mfma_f32_16x16x32_f16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0] +/* mfmaIndex:123 */ +v_mfma_f32_16x16x32_f16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], 
v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0] +/* mfmaIndex:124 */ +v_mfma_f32_16x16x32_f16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0] +/* mfmaIndex:125 */ +v_mfma_f32_16x16x32_f16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0] +/* mfmaIndex:126 */ +v_mfma_f32_16x16x32_f16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0] +/* mfmaIndex:127 */ +v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0] +/* numPrefetchIter=0 */ +/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */ +/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ +label_toPGR1end_OrdNLL: +label_PrefetchGlobalLastIterEnd: + +/* Tail: add ValuA/B vgpr buffer [4...132) to pool */ + +/* Tail: add address/G2L vgpr [132...132) to pool */ +label_Summation_End_S4FDBQ587JJL6NOU: +.set sgprWGM, UNDEF +.set sgprLoopCounterL, UNDEF +.set sgprOrigLoopCounter, UNDEF +.set sgprAddressA, UNDEF +.set sgprAddressB, UNDEF +.set sgprStridesA, UNDEF +.set sgprStridesB, UNDEF +.set sgprStaggerUIter, UNDEF +.set sgprSrdA, UNDEF +.set sgprSrdB, UNDEF +.set sgprShadowLimitA, UNDEF +.set sgprShadowLimitB, UNDEF +.set sgprWrapUA, UNDEF +.set sgprWrapUB, UNDEF +.set sgprGlobalReadIncsA, UNDEF +.set sgprGlobalReadIncsB, UNDEF +.set sgprScalarGlobalReadOffsetA, UNDEF +.set sgprScalarGlobalReadOffsetB, UNDEF +/* load store sgprs */ + +/* Mapping of Acc register -> C Vgpr register */ + +/* not-LocalSplitU: global write indices */ +/* computeStoreVgprs */ +v_lshrrev_b32 v8, 
6, v[vgprSerial] // 8 = Serial / 64 +v_lshrrev_b32 v9, 1, v8 // 9 = 8 / 2 +v_mul_lo_u32 v9, 0x10, v9 // wave coordination offset 1 +v_and_b32 v5, 63, v[vgprSerial] // v5 = v[vgprSerial] % 64 +v_lshrrev_b32 v5, 4, v5 // 5 = 5 / 16 +v_lshlrev_b32 v5, 2, v5 // thread0 * continuous_output +v_add_lshl_u32 v5, v9, v5, 3 // coordination 1 = vwB *(wave_id1 + tid1) +v_mul_lo_u32 v6, v5, s[sgprStrideC1J] // offset 1 +v_mul_lo_u32 v7, v5, s[sgprStrideD1J] // offset 1 +v_and_b32 v4, 1, v8 // v4 = v8 % 2 +v_mul_lo_u32 v4, 0x10, v4 // wave coordination offset 0 +v_and_b32 v9, 15, v[vgprSerial] // v9 = v[vgprSerial] % 16 +v_add_lshl_u32 v4, v9, v4, 3 // coordination 0 = vwA * (wave_id0 + tid0) +s_mul_i32 s8, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v4, s8, v4 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s8, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v5, s8, v5 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 + +/* not-LocalSplitU: global write */ + +/******************************************/ +/* Global Write Elements */ +/******************************************/ +s_and_b32 s8, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s8, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU_4 // branch if GSU == 1 +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B0_E1_M // jump if edges required +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B0_E1_N // jump if edges required +label_GW_B0_E0_1: + +/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_lshl_u32 v11, v7, v4, 0x2 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+216], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+217], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+218], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+219], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+220], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+221], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+222], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+223], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+224], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+225], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+226], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+227], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+228], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+229], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+230], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+231], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+232], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+233], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+234], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+235], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+236], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+237], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+238], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+239], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+240], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+241], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+242], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+243], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+244], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+245], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+246], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+247], acc127 // copy acc to vreg[223] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[16:19], v11, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[84:87], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[92:95], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[100:103], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[108:111], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[116:119], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[124:127], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[140:143], v11, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[148:151], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[156:159], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[164:167], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[172:175], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) 
+buffer_store_dwordx4 v[176:179], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[180:183], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[188:191], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[196:199], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[204:207], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[212:215], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // 
incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[220:223], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[228:231], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[232:235], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[236:239], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[240:243], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[244:247], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* 
calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+16], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+17], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+20], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+21], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+22], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+23], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+24], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+25], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+26], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+27], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+28], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+29], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+30], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+31], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+32], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+33], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+34], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+35], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+36], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+37], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+38], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+39], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+40], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+41], acc231 // copy acc to vreg[249] 
+v_accvgpr_read_b32 v[vgprValuC+42], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+43], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+44], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+45], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+46], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+47], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ + +/* apply mask, calc new C and issue writes */ +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_lshl_b32 s12, s[sgprStrideD1J], 2 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], 
s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_B0_E1_N: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v11, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v12, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v12, v10, v12, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v13, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v13, v10, v13, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v14, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v14, v10, v14, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to 
vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] 
+v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // 
copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 
v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to vreg[145] 
+v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+198], acc186 
// copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[24:27], v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v12, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:16 nt // store D +buffer_store_dwordx4 v[32:35], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[40:43], v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[56:59], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[64:67], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[72:75], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[80:83], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[84:87], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[88:91], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[92:95], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[96:99], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[100:103], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[104:107], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[108:111], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[112:115], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dwordx4 v[116:119], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[120:123], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[124:127], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[136:139], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[140:143], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[144:147], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[148:151], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[152:155], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[156:159], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[160:163], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[164:167], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[168:171], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[172:175], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[176:179], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[180:183], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[184:187], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[188:191], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[192:195], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[196:199], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[200:203], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[204:207], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // 
store D +buffer_store_dwordx4 v[208:211], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[212:215], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords and apply mask for rows vc1=24..31; this batch issues no loads, it only builds one clipped D byte offset per row in v11-v15 and v80-v82 */ +v_mov_b32 v10, BufferOOB // v10 = offset selected by the v_cndmask_b32 below when a row is out of bounds +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v11, v7, v4, 0x2 // scaleToBpe: v11 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v11, v10, v11, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v12, v7, v4, 0x2 // scaleToBpe: v12 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v12, v10, v12, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v13, v7, v4, 0x2 // scaleToBpe: v13 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v13, v10, v13, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v14, v7, v4, 0x2 // scaleToBpe: v14 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v14, v10, v14, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x2 // scaleToBpe: v15 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v7, v4, 0x2 // scaleToBpe: v80 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v81, v7, v4, 0x2 // scaleToBpe: v81 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v81, v10, v81, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v7, v4, 0x2 // scaleToBpe: v82 = (v7+v4) << 2, byte offset into D (row pointer v7 plus coord0 v4, times 4) +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc3 // copy acc to vreg[192]; this batch copies acc3, acc7, ..., acc255 (every 4th accumulator) into v[vgprValuC+16..79] +v_accvgpr_read_b32 v[vgprValuC+17], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+18], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+19], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+20], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+21], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+22], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+23], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+24], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+25], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+26], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+27], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+28], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+29], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+30], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+31], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+32], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+33], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+34], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+35], acc79 // copy acc to
vreg[211] +v_accvgpr_read_b32 v[vgprValuC+36], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+37], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+38], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+39], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+40], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+41], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+42], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+43], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+44], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+45], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+46], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+47], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+48], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+49], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+50], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+51], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+52], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+53], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+54], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+55], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+56], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+57], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+58], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+59], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+60], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+61], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+62], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+63], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+64], acc195 // copy acc to
vreg[240] +v_accvgpr_read_b32 v[vgprValuC+65], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+66], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+67], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+68], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+69], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+70], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+71], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+72], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+73], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+74], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+75], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+76], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+77], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+78], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+79], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] (no scaling instructions are emitted for this batch; the stores follow directly) */ + +/* issue writes: two dwordx4 stores per row (offset:0 and offset:16) through s[sgprSrdD:sgprSrdD+3], addressed by the clipped offsets computed above; data registers v[16:79] presumably alias v[vgprValuC+16..79] (vgprValuC == 0) - TODO confirm against the macro definitions */ +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[20:23], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[24:27], v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[28:31], v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[32:35], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[36:39], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[40:43], v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[44:47], v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
+buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[56:59], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[60:63], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[64:67], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[68:71], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +buffer_store_dwordx4 v[72:75], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dwordx4 v[76:79], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_1 // jump to end +label_GW_B0_E1_M: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=116 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1);
(0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v127, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v127, v10, v127, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+11], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+12], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+13], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+14], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+15], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+16], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+17], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+18], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+19], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+20], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+21], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+22], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+23], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+24], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+25], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+26], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+27], acc64 // copy acc to 
vreg[16] +v_accvgpr_read_b32 v[vgprValuC+28], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+29], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+30], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+31], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+32], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+33], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+34], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+35], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+36], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+37], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+38], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+39], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+40], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+41], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+42], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+43], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+44], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+45], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+46], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+47], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+48], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+49], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+50], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+51], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+52], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+53], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+54], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+55], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+56], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 
v[vgprValuC+57], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+58], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+59], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+60], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+61], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+62], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+63], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+64], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+65], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+66], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+67], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+68], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+69], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+70], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+71], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+72], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+73], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+74], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+75], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+76], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+77], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+78], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+79], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+80], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+81], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+82], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+83], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+84], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+85], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+86], acc45 // copy acc to 
vreg[75] +v_accvgpr_read_b32 v[vgprValuC+87], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+88], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+89], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+90], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+91], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+92], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+93], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+94], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+95], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+96], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+97], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+98], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+99], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+100], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+101], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+102], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+103], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+104], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+105], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+106], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+107], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+108], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+109], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+110], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+111], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+112], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+113], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+114], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+115], acc161 // copy acc to vreg[104] 
+v_accvgpr_read_b32 v[vgprValuC+116], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+117], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+118], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+119], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+120], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+121], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+122], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+123], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+124], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+125], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+126], acc205 // copy acc to vreg[115] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), 
(0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v11, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v12, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v13, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v14, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v15, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v28, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v37, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v43, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v44, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v45, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v46, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v47, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v49, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v50, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v51, v170, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v52, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v53, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v55, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v56, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v57, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v58, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v59, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v60, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v61, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v62, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v63, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v64, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v65, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v66, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v67, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v68, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v69, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v70, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v71, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v72, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v73, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v74, v193, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v75, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v76, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v77, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v78, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v79, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v80, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v81, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v82, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v83, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v84, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v85, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v86, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v87, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v88, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v89, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v90, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v91, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v92, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v93, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v94, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v95, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v96, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v97, v216, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v98, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v99, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v100, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v101, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v102, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v103, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v104, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v105, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v106, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v107, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v108, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v109, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v110, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v111, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v112, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v113, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v114, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v115, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v116, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v117, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v118, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v119, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v120, v239, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v121, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v122, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v123, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v124, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v125, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v126, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); 
(0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1); (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v127, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v127, v10, v127, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+11], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+12], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+13], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+14], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+15], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+16], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+17], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+18], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+19], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+20], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+21], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+22], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+23], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+24], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+25], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+26], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 
v[vgprValuC+27], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+28], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+29], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+30], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+31], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+32], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+33], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+34], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+35], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+36], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+37], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+38], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+39], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+40], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+41], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+42], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+43], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+44], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+45], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+46], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+47], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+48], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+49], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+50], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+51], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+52], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+53], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+54], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+55], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+56], 
acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+57], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+58], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+59], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+60], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+61], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+62], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+63], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+64], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+65], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+66], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+67], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+68], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+69], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+70], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+71], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+72], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+73], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+74], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+75], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+76], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+77], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+78], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+79], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+80], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+81], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+82], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+83], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+84], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 
v[vgprValuC+85], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+86], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+87], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+88], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+89], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+90], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+91], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+92], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+93], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+94], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+95], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+96], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+97], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+98], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+99], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+100], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+101], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+102], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+103], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+104], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+105], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+106], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+107], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+108], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+109], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+110], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+111], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+112], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+113], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 
v[vgprValuC+114], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+115], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+116], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+117], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+118], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+119], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+120], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+121], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+122], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+123], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+124], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+125], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+126], acc159 // copy acc to vreg[231] + +/* rC *= alpha batchElements=[(0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 
3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3), (0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v11, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v12, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v13, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v14, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v15, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // 
store D +buffer_store_dword v26, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v33, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v35, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v36, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v37, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v38, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v39, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v40, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v41, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v42, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v43, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v44, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v45, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v46, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v47, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+buffer_store_dword v49, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v50, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v51, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v52, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v53, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v55, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v56, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v57, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v58, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v59, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v60, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v61, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v62, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v63, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v64, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v65, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v66, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v67, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v68, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v69, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v70, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v71, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v72, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v73, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v74, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v75, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v76, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v77, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v78, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v79, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v80, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v81, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v82, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v83, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v84, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v85, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v86, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v87, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v88, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v89, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v90, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v91, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v92, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v93, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v94, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v95, v214, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v96, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v97, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v98, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v99, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v100, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v101, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v102, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v103, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v104, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v105, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v106, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v107, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v108, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v109, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v110, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v111, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v112, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v113, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v114, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v115, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v116, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v117, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v118, v237, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v119, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v120, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v121, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v122, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v123, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v124, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v125, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v126, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ +/* Overview of this batch: it handles 24 single-dword elements (vc1 rows 29..31, vc0 columns 0..7) in three phases. Phase 1 (per element): at the first column of each row v5 (coord1) is incremented by 1 and the row pointers v6/v7 are advanced by StrideC1J/StrideD1J; for columns 1..7 the column coordinate v8 = v4 + vc0 is formed; both coordinates are compared against SizeI/SizeJ, the two masks are ANDed into s[34:35], the offset (v7 + coord0) << 2 is computed into v35..v58, and v_cndmask replaces it with v10 (BufferOOB) in lanes where the mask is clear. Phase 2: acc163..acc255 (step 4) are copied into v[vgprValuC+11..+34]. Phase 3: one buffer_store_dword per element writes through the sgprSrdD descriptor using the offsets from phase 1. v8, vcc, s[30:31] and s[34:35] are scratch that is overwritten for every element, so the instruction order within an element must be kept. */ +/* NOTE(review): no load instruction appears in this batch even though the section title below mentions loads, and no multiply follows the later "rC *= alpha" comment; both look like generator boilerplate. The per-element "into Cin addr" comments sit on an instruction that adds v7, which the row-increment comments call coutRowPtrD, and the result is used as the D store offset. BufferOOB is defined outside this chunk; presumably it is an offset the buffer descriptor treats as out of range so the store is dropped - confirm. */ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // 
coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v35, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v35, v10, v35, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v36, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v36, v10, v36, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v37, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v37, v10, v37, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v38, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v38, v10, v38, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v39, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v39, v10, v39, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v40, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v40, v10, v40, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v41, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v41, v10, v41, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v42, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v42, v10, v42, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v43, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v43, v10, v43, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v45, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v45, v10, v45, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v47, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v47, v10, v47, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v49, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v49, v10, v49, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v51, v7, v4, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v51, v10, v51, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v53, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v53, v10, v53, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v55, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v55, v10, v55, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v57, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v57, v10, v57, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v7, v8, 0x2 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+11], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+12], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+13], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+14], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+15], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+16], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+17], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+18], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+19], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+20], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+21], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+22], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+23], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+24], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+25], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+26], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+27], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+28], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+29], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+30], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+31], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+32], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+33], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+34], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 
0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ + +/* apply mask, calc new C and issue writes */ +buffer_store_dword v11, v35, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v12, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v13, v37, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v14, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v15, v39, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v16, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v17, v41, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v18, v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v19, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v20, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v21, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v22, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v23, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v24, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v25, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v26, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v27, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v28, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v29, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v30, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v31, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v32, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword 
v33, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +buffer_store_dword v34, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* NOTE(review): the stores immediately above are buffer_store_dword, not dwordx4; the s_nop comment text appears to be generator boilerplate. */ +s_branch label_GW_End_1 // jump to end +/* NOTE(review): the branch above targets the label on the very next line, so on this path it is equivalent to falling through. */ +label_GW_End_1: +/* Long jump to label_KernelEnd: s[30:31] receives the address of the next instruction, s32 = label_KernelEnd + 4, and the 64-bit sum (low add, then add-with-carry into the high dword) is written to the PC. How the assembler evaluates the label operand in s_add_i32 is not visible in this chunk - presumably a PC-relative offset; confirm. Clobbers s[30:31] and s32. */ +s_getpc_b64 s[30:31] // addr of next instr +s_add_i32 s32, label_KernelEnd, 4 // target branch offset +s_add_u32 s30, s30, s32 // add target branch offset +s_addc_u32 s31, s31, 0 // add high and carry +s_setpc_b64 s[30:31] // branch to label_KernelEnd +label_GSU_4: +/* Write-path dispatch: (1) if Beta != 0 branch to label_GW_Beta_2. (2) Otherwise compute rMT0 = SizeI % 256 (via AND with 255), kept only when WorkGroup0 >= NumWorkGroups0 - 1 and forced to 0 for all other workgroups; if rMT0 > 0 branch to label_GW_B0_E1_M_1. (3) Repeat with SizeJ / WorkGroup1 / NumWorkGroups1 and branch to label_GW_B0_E1_N_1 if rMT1 > 0. (4) Otherwise fall through to label_GW_B0_E0_2. s30 and s31 are scratch; each s_cselect and s_cbranch depends on the SCC value set by the compare directly before it, so the instruction order must be kept. */ +s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 +s_cbranch_scc0 label_GW_Beta_2 // Branch if Beta is not zero + +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B0_E1_M_1 // jump if edges required +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B0_E1_N_1 // jump if edges required +label_GW_B0_E0_2: + +/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_lshl_u32 v11, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to 
vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 
v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+216], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+217], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+218], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+219], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+220], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+221], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+222], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+223], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+224], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+225], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+226], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+227], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+228], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+229], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+230], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+231], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+232], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+233], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+234], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+235], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+236], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+237], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+238], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+239], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+240], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+241], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+242], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+243], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+244], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+245], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+246], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+247], acc127 // copy acc to vreg[223] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+216:vgprValuC+216+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+216:vgprValuC+216+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+218:vgprValuC+218+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+218:vgprValuC+218+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+220:vgprValuC+220+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+220:vgprValuC+220+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+222:vgprValuC+222+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+222:vgprValuC+222+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+224:vgprValuC+224+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+224:vgprValuC+224+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+226:vgprValuC+226+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+226:vgprValuC+226+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+228:vgprValuC+228+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+228:vgprValuC+228+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+230:vgprValuC+230+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+230:vgprValuC+230+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+232:vgprValuC+232+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+232:vgprValuC+232+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+234:vgprValuC+234+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+234:vgprValuC+234+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+236:vgprValuC+236+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+236:vgprValuC+236+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+238:vgprValuC+238+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+238:vgprValuC+238+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+240:vgprValuC+240+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+240:vgprValuC+240+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+242:vgprValuC+242+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+242:vgprValuC+242+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+244:vgprValuC+244+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+244:vgprValuC+244+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+246:vgprValuC+246+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+246:vgprValuC+246+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with 
neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 
+v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +s_lshl_b32 s12, 
s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 
+v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to 
fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], 
s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 
+v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, 
v[vgprValuC+112], v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+136], v[vgprValuC+136] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+137], v[vgprValuC+137] // convert C to fp16 +v_pack_b32_f16 v136, v[vgprValuC+136], v[vgprValuC+137] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+138], v[vgprValuC+138] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+139], v[vgprValuC+139] // convert C to fp16 +v_pack_b32_f16 v137, v[vgprValuC+138], v[vgprValuC+139] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+140], v[vgprValuC+140] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+141], v[vgprValuC+141] // convert C to fp16 +v_pack_b32_f16 v138, v[vgprValuC+140], v[vgprValuC+141] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+142], v[vgprValuC+142] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+143], v[vgprValuC+143] // convert C to fp16 +v_pack_b32_f16 v139, v[vgprValuC+142], v[vgprValuC+143] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+144], v[vgprValuC+144] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+145], v[vgprValuC+145] // convert C to fp16 +v_pack_b32_f16 v144, v[vgprValuC+144], v[vgprValuC+145] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+146], v[vgprValuC+146] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+147], v[vgprValuC+147] // convert C to fp16 +v_pack_b32_f16 v145, v[vgprValuC+146], v[vgprValuC+147] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+148], v[vgprValuC+148] // convert C to fp16 
+v_cvt_f16_f32 v[vgprValuC+149], v[vgprValuC+149] // convert C to fp16 +v_pack_b32_f16 v146, v[vgprValuC+148], v[vgprValuC+149] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+150], v[vgprValuC+150] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+151], v[vgprValuC+151] // convert C to fp16 +v_pack_b32_f16 v147, v[vgprValuC+150], v[vgprValuC+151] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+152], v[vgprValuC+152] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+153], v[vgprValuC+153] // convert C to fp16 +v_pack_b32_f16 v152, v[vgprValuC+152], v[vgprValuC+153] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+154], v[vgprValuC+154] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+155], v[vgprValuC+155] // convert C to fp16 +v_pack_b32_f16 v153, v[vgprValuC+154], v[vgprValuC+155] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+156], v[vgprValuC+156] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+157], v[vgprValuC+157] // convert C to fp16 +v_pack_b32_f16 v154, v[vgprValuC+156], v[vgprValuC+157] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+158], v[vgprValuC+158] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+159], v[vgprValuC+159] // convert C to fp16 +v_pack_b32_f16 v155, v[vgprValuC+158], v[vgprValuC+159] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+160], v[vgprValuC+160] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+161], v[vgprValuC+161] // convert C to fp16 +v_pack_b32_f16 v160, v[vgprValuC+160], v[vgprValuC+161] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+162], v[vgprValuC+162] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+163], v[vgprValuC+163] // convert C to fp16 +v_pack_b32_f16 v161, v[vgprValuC+162], v[vgprValuC+163] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+164], v[vgprValuC+164] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+165], v[vgprValuC+165] // convert C to fp16 +v_pack_b32_f16 v162, v[vgprValuC+164], v[vgprValuC+165] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+166], v[vgprValuC+166] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+167], v[vgprValuC+167] // convert C to fp16 +v_pack_b32_f16 v163, v[vgprValuC+166], v[vgprValuC+167] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+168], v[vgprValuC+168] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+169], v[vgprValuC+169] // convert C to fp16 +v_pack_b32_f16 v168, v[vgprValuC+168], v[vgprValuC+169] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+170], v[vgprValuC+170] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+171], v[vgprValuC+171] // convert C to fp16 +v_pack_b32_f16 v169, v[vgprValuC+170], v[vgprValuC+171] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+172], v[vgprValuC+172] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+173], v[vgprValuC+173] // convert C to fp16 +v_pack_b32_f16 v170, v[vgprValuC+172], v[vgprValuC+173] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+174], v[vgprValuC+174] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+175], v[vgprValuC+175] // convert C to fp16 +v_pack_b32_f16 v171, v[vgprValuC+174], v[vgprValuC+175] // Pack 
with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[168:171], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+176], v[vgprValuC+176] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+177], v[vgprValuC+177] // convert C to fp16 +v_pack_b32_f16 v176, v[vgprValuC+176], v[vgprValuC+177] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+178], v[vgprValuC+178] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+179], v[vgprValuC+179] // convert C to fp16 +v_pack_b32_f16 v177, v[vgprValuC+178], v[vgprValuC+179] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+180], v[vgprValuC+180] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+181], v[vgprValuC+181] // convert C to fp16 +v_pack_b32_f16 v178, v[vgprValuC+180], v[vgprValuC+181] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+182], v[vgprValuC+182] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+183], v[vgprValuC+183] // convert C to fp16 +v_pack_b32_f16 v179, v[vgprValuC+182], v[vgprValuC+183] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[176:179], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+184], v[vgprValuC+184] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+185], v[vgprValuC+185] // convert C to fp16 +v_pack_b32_f16 v184, v[vgprValuC+184], v[vgprValuC+185] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+186], v[vgprValuC+186] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+187], v[vgprValuC+187] // convert C to fp16 +v_pack_b32_f16 v185, v[vgprValuC+186], v[vgprValuC+187] // Pack with neighbor 
+v_cvt_f16_f32 v[vgprValuC+188], v[vgprValuC+188] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+189], v[vgprValuC+189] // convert C to fp16 +v_pack_b32_f16 v186, v[vgprValuC+188], v[vgprValuC+189] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+190], v[vgprValuC+190] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+191], v[vgprValuC+191] // convert C to fp16 +v_pack_b32_f16 v187, v[vgprValuC+190], v[vgprValuC+191] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[184:187], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+192], v[vgprValuC+192] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+193], v[vgprValuC+193] // convert C to fp16 +v_pack_b32_f16 v192, v[vgprValuC+192], v[vgprValuC+193] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+194], v[vgprValuC+194] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+195], v[vgprValuC+195] // convert C to fp16 +v_pack_b32_f16 v193, v[vgprValuC+194], v[vgprValuC+195] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+196], v[vgprValuC+196] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+197], v[vgprValuC+197] // convert C to fp16 +v_pack_b32_f16 v194, v[vgprValuC+196], v[vgprValuC+197] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+198], v[vgprValuC+198] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+199], v[vgprValuC+199] // convert C to fp16 +v_pack_b32_f16 v195, v[vgprValuC+198], v[vgprValuC+199] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[192:195], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 
v[vgprValuC+200], v[vgprValuC+200] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+201], v[vgprValuC+201] // convert C to fp16 +v_pack_b32_f16 v200, v[vgprValuC+200], v[vgprValuC+201] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+202], v[vgprValuC+202] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+203], v[vgprValuC+203] // convert C to fp16 +v_pack_b32_f16 v201, v[vgprValuC+202], v[vgprValuC+203] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+204], v[vgprValuC+204] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+205], v[vgprValuC+205] // convert C to fp16 +v_pack_b32_f16 v202, v[vgprValuC+204], v[vgprValuC+205] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+206], v[vgprValuC+206] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+207], v[vgprValuC+207] // convert C to fp16 +v_pack_b32_f16 v203, v[vgprValuC+206], v[vgprValuC+207] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[200:203], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+208], v[vgprValuC+208] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+209], v[vgprValuC+209] // convert C to fp16 +v_pack_b32_f16 v208, v[vgprValuC+208], v[vgprValuC+209] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+210], v[vgprValuC+210] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+211], v[vgprValuC+211] // convert C to fp16 +v_pack_b32_f16 v209, v[vgprValuC+210], v[vgprValuC+211] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+212], v[vgprValuC+212] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+213], v[vgprValuC+213] // convert C to fp16 +v_pack_b32_f16 v210, v[vgprValuC+212], v[vgprValuC+213] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+214], v[vgprValuC+214] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+215], v[vgprValuC+215] // convert C to 
fp16 +v_pack_b32_f16 v211, v[vgprValuC+214], v[vgprValuC+215] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[208:211], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+216], v[vgprValuC+216] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+217], v[vgprValuC+217] // convert C to fp16 +v_pack_b32_f16 v216, v[vgprValuC+216], v[vgprValuC+217] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+218], v[vgprValuC+218] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+219], v[vgprValuC+219] // convert C to fp16 +v_pack_b32_f16 v217, v[vgprValuC+218], v[vgprValuC+219] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+220], v[vgprValuC+220] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+221], v[vgprValuC+221] // convert C to fp16 +v_pack_b32_f16 v218, v[vgprValuC+220], v[vgprValuC+221] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+222], v[vgprValuC+222] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+223], v[vgprValuC+223] // convert C to fp16 +v_pack_b32_f16 v219, v[vgprValuC+222], v[vgprValuC+223] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[216:219], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+224], v[vgprValuC+224] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+225], v[vgprValuC+225] // convert C to fp16 +v_pack_b32_f16 v224, v[vgprValuC+224], v[vgprValuC+225] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+226], v[vgprValuC+226] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+227], v[vgprValuC+227] // convert C to fp16 
+v_pack_b32_f16 v225, v[vgprValuC+226], v[vgprValuC+227] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+228], v[vgprValuC+228] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+229], v[vgprValuC+229] // convert C to fp16 +v_pack_b32_f16 v226, v[vgprValuC+228], v[vgprValuC+229] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+230], v[vgprValuC+230] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+231], v[vgprValuC+231] // convert C to fp16 +v_pack_b32_f16 v227, v[vgprValuC+230], v[vgprValuC+231] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[224:227], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+232], v[vgprValuC+232] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+233], v[vgprValuC+233] // convert C to fp16 +v_pack_b32_f16 v232, v[vgprValuC+232], v[vgprValuC+233] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+234], v[vgprValuC+234] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+235], v[vgprValuC+235] // convert C to fp16 +v_pack_b32_f16 v233, v[vgprValuC+234], v[vgprValuC+235] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+236], v[vgprValuC+236] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+237], v[vgprValuC+237] // convert C to fp16 +v_pack_b32_f16 v234, v[vgprValuC+236], v[vgprValuC+237] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+238], v[vgprValuC+238] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+239], v[vgprValuC+239] // convert C to fp16 +v_pack_b32_f16 v235, v[vgprValuC+238], v[vgprValuC+239] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 
v[232:235], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+240], v[vgprValuC+240] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+241], v[vgprValuC+241] // convert C to fp16 +v_pack_b32_f16 v240, v[vgprValuC+240], v[vgprValuC+241] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+242], v[vgprValuC+242] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+243], v[vgprValuC+243] // convert C to fp16 +v_pack_b32_f16 v241, v[vgprValuC+242], v[vgprValuC+243] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+244], v[vgprValuC+244] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+245], v[vgprValuC+245] // convert C to fp16 +v_pack_b32_f16 v242, v[vgprValuC+244], v[vgprValuC+245] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+246], v[vgprValuC+246] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+247], v[vgprValuC+247] // convert C to fp16 +v_pack_b32_f16 v243, v[vgprValuC+246], v[vgprValuC+247] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[240:243], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_accvgpr_read_b32 v[vgprValuC+16], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 
v[vgprValuC+17], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+20], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+21], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+22], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+23], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+24], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+25], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+26], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+27], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+28], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+29], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+30], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+31], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+32], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+33], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+34], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+35], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+36], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+37], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+38], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+39], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+40], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+41], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+42], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+43], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+44], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+45], acc247 // copy acc to vreg[253] 
+v_accvgpr_read_b32 v[vgprValuC+46], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+47], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], 
v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B0_E1_N_1: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v11, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v12, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v12, v10, v12, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v13, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v13, v10, v13, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v14, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v14, v10, v14, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 
v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy 
acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 
v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 
v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] 
+v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+168], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+169], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+170], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+171], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+172], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+173], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+174], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+175], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+176], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+177], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+178], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+179], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+180], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+181], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+182], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+183], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+184], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+185], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+186], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+187], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+188], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+189], acc150 // copy 
acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+190], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+191], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+192], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+193], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+194], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+195], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+196], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+197], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+198], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+199], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+200], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+201], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+202], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+203], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+204], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+205], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+206], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+207], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+208], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+209], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+210], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+211], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+212], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+213], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+214], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+215], acc254 // copy acc to vreg[191] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 
0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] 
// *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with 
neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +buffer_store_dwordx4 v[24:27], v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +buffer_store_dwordx4 v[32:35], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +buffer_store_dwordx4 v[40:43], v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], 
v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +buffer_store_dwordx4 v[56:59], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to 
fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +buffer_store_dwordx4 v[64:67], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +buffer_store_dwordx4 v[72:75], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, 
v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +buffer_store_dwordx4 v[80:83], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +buffer_store_dwordx4 v[88:91], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 +v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], 
v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +buffer_store_dwordx4 v[96:99], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +buffer_store_dwordx4 v[104:107], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, v[vgprValuC+112], v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 
+v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +buffer_store_dwordx4 v[112:115], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +buffer_store_dwordx4 v[120:123], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+136], v[vgprValuC+136] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+137], v[vgprValuC+137] // convert C to fp16 +v_pack_b32_f16 v136, v[vgprValuC+136], v[vgprValuC+137] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+138], v[vgprValuC+138] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+139], v[vgprValuC+139] // convert C to fp16 +v_pack_b32_f16 v137, v[vgprValuC+138], v[vgprValuC+139] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+140], v[vgprValuC+140] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+141], v[vgprValuC+141] // convert C to fp16 +v_pack_b32_f16 v138, v[vgprValuC+140], v[vgprValuC+141] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+142], v[vgprValuC+142] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+143], 
v[vgprValuC+143] // convert C to fp16 +v_pack_b32_f16 v139, v[vgprValuC+142], v[vgprValuC+143] // Pack with neighbor +buffer_store_dwordx4 v[136:139], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+144], v[vgprValuC+144] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+145], v[vgprValuC+145] // convert C to fp16 +v_pack_b32_f16 v144, v[vgprValuC+144], v[vgprValuC+145] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+146], v[vgprValuC+146] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+147], v[vgprValuC+147] // convert C to fp16 +v_pack_b32_f16 v145, v[vgprValuC+146], v[vgprValuC+147] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+148], v[vgprValuC+148] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+149], v[vgprValuC+149] // convert C to fp16 +v_pack_b32_f16 v146, v[vgprValuC+148], v[vgprValuC+149] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+150], v[vgprValuC+150] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+151], v[vgprValuC+151] // convert C to fp16 +v_pack_b32_f16 v147, v[vgprValuC+150], v[vgprValuC+151] // Pack with neighbor +buffer_store_dwordx4 v[144:147], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+152], v[vgprValuC+152] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+153], v[vgprValuC+153] // convert C to fp16 +v_pack_b32_f16 v152, v[vgprValuC+152], v[vgprValuC+153] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+154], v[vgprValuC+154] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+155], v[vgprValuC+155] // convert C to fp16 +v_pack_b32_f16 v153, v[vgprValuC+154], v[vgprValuC+155] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+156], v[vgprValuC+156] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+157], v[vgprValuC+157] // convert C to fp16 +v_pack_b32_f16 v154, v[vgprValuC+156], v[vgprValuC+157] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+158], v[vgprValuC+158] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+159], v[vgprValuC+159] // convert C to fp16 
+v_pack_b32_f16 v155, v[vgprValuC+158], v[vgprValuC+159] // Pack with neighbor +buffer_store_dwordx4 v[152:155], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+160], v[vgprValuC+160] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+161], v[vgprValuC+161] // convert C to fp16 +v_pack_b32_f16 v160, v[vgprValuC+160], v[vgprValuC+161] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+162], v[vgprValuC+162] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+163], v[vgprValuC+163] // convert C to fp16 +v_pack_b32_f16 v161, v[vgprValuC+162], v[vgprValuC+163] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+164], v[vgprValuC+164] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+165], v[vgprValuC+165] // convert C to fp16 +v_pack_b32_f16 v162, v[vgprValuC+164], v[vgprValuC+165] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+166], v[vgprValuC+166] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+167], v[vgprValuC+167] // convert C to fp16 +v_pack_b32_f16 v163, v[vgprValuC+166], v[vgprValuC+167] // Pack with neighbor +buffer_store_dwordx4 v[160:163], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+168], v[vgprValuC+168] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+169], v[vgprValuC+169] // convert C to fp16 +v_pack_b32_f16 v168, v[vgprValuC+168], v[vgprValuC+169] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+170], v[vgprValuC+170] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+171], v[vgprValuC+171] // convert C to fp16 +v_pack_b32_f16 v169, v[vgprValuC+170], v[vgprValuC+171] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+172], v[vgprValuC+172] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+173], v[vgprValuC+173] // convert C to fp16 +v_pack_b32_f16 v170, v[vgprValuC+172], v[vgprValuC+173] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+174], v[vgprValuC+174] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+175], v[vgprValuC+175] // convert C to fp16 +v_pack_b32_f16 v171, v[vgprValuC+174], 
v[vgprValuC+175] // Pack with neighbor +buffer_store_dwordx4 v[168:171], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+176], v[vgprValuC+176] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+177], v[vgprValuC+177] // convert C to fp16 +v_pack_b32_f16 v176, v[vgprValuC+176], v[vgprValuC+177] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+178], v[vgprValuC+178] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+179], v[vgprValuC+179] // convert C to fp16 +v_pack_b32_f16 v177, v[vgprValuC+178], v[vgprValuC+179] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+180], v[vgprValuC+180] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+181], v[vgprValuC+181] // convert C to fp16 +v_pack_b32_f16 v178, v[vgprValuC+180], v[vgprValuC+181] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+182], v[vgprValuC+182] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+183], v[vgprValuC+183] // convert C to fp16 +v_pack_b32_f16 v179, v[vgprValuC+182], v[vgprValuC+183] // Pack with neighbor +buffer_store_dwordx4 v[176:179], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+184], v[vgprValuC+184] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+185], v[vgprValuC+185] // convert C to fp16 +v_pack_b32_f16 v184, v[vgprValuC+184], v[vgprValuC+185] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+186], v[vgprValuC+186] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+187], v[vgprValuC+187] // convert C to fp16 +v_pack_b32_f16 v185, v[vgprValuC+186], v[vgprValuC+187] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+188], v[vgprValuC+188] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+189], v[vgprValuC+189] // convert C to fp16 +v_pack_b32_f16 v186, v[vgprValuC+188], v[vgprValuC+189] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+190], v[vgprValuC+190] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+191], v[vgprValuC+191] // convert C to fp16 +v_pack_b32_f16 v187, v[vgprValuC+190], v[vgprValuC+191] // Pack with neighbor 
+buffer_store_dwordx4 v[184:187], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+192], v[vgprValuC+192] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+193], v[vgprValuC+193] // convert C to fp16 +v_pack_b32_f16 v192, v[vgprValuC+192], v[vgprValuC+193] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+194], v[vgprValuC+194] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+195], v[vgprValuC+195] // convert C to fp16 +v_pack_b32_f16 v193, v[vgprValuC+194], v[vgprValuC+195] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+196], v[vgprValuC+196] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+197], v[vgprValuC+197] // convert C to fp16 +v_pack_b32_f16 v194, v[vgprValuC+196], v[vgprValuC+197] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+198], v[vgprValuC+198] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+199], v[vgprValuC+199] // convert C to fp16 +v_pack_b32_f16 v195, v[vgprValuC+198], v[vgprValuC+199] // Pack with neighbor +buffer_store_dwordx4 v[192:195], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+200], v[vgprValuC+200] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+201], v[vgprValuC+201] // convert C to fp16 +v_pack_b32_f16 v200, v[vgprValuC+200], v[vgprValuC+201] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+202], v[vgprValuC+202] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+203], v[vgprValuC+203] // convert C to fp16 +v_pack_b32_f16 v201, v[vgprValuC+202], v[vgprValuC+203] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+204], v[vgprValuC+204] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+205], v[vgprValuC+205] // convert C to fp16 +v_pack_b32_f16 v202, v[vgprValuC+204], v[vgprValuC+205] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+206], v[vgprValuC+206] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+207], v[vgprValuC+207] // convert C to fp16 +v_pack_b32_f16 v203, v[vgprValuC+206], v[vgprValuC+207] // Pack with neighbor +buffer_store_dwordx4 v[200:203], v228, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+208], v[vgprValuC+208] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+209], v[vgprValuC+209] // convert C to fp16 +v_pack_b32_f16 v208, v[vgprValuC+208], v[vgprValuC+209] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+210], v[vgprValuC+210] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+211], v[vgprValuC+211] // convert C to fp16 +v_pack_b32_f16 v209, v[vgprValuC+210], v[vgprValuC+211] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+212], v[vgprValuC+212] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+213], v[vgprValuC+213] // convert C to fp16 +v_pack_b32_f16 v210, v[vgprValuC+212], v[vgprValuC+213] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+214], v[vgprValuC+214] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+215], v[vgprValuC+215] // convert C to fp16 +v_pack_b32_f16 v211, v[vgprValuC+214], v[vgprValuC+215] // Pack with neighbor +buffer_store_dwordx4 v[208:211], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary). For each of the 8 rows of this batch (vc1=24..31): advance coord1 (v5) by 1 and the row pointers v6/v7 by StrideC1J/StrideD1J, compute in0 && in1 into s[34:35], then keep (v7 + v4) shifted left by 1 when in range or substitute BufferOOB (v10) otherwise. The results land in v11-v15 and v80-v82, which the buffer_store_dwordx4 instructions of this batch use as their SrdD offsets, so the 'Cin addr' wording on v_add_lshl_u32 is generator text for what is a D offset here. No buffer loads appear in this batch. */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 
s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v11, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v12, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v12, v10, v12, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v13, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v13, v10, v13, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v14, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v14, v10, v14, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v15, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v15, v10, v15, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v80, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v80, v10, v80, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v81, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v81, v10, v81, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v82, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v82, v10, v82, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+16], acc3 // copy acc to vreg[192]; this run fills consecutive vgprValuC+16.. from every fourth accumulator (acc3, acc7, ...), and the vreg[N] tag in these comments counts from 192 while the destination index counts from 16 +v_accvgpr_read_b32 v[vgprValuC+17], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+18], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+19], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+20], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+21], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+22], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+23], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+24], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+25], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+26], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+27], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+28], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+29], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+30], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+31], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+32], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+33], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+34], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+35], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+36], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+37], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+38], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+39], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+40], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+41], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+42], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+43], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+44], acc115 // copy acc to vreg[220] 
+v_accvgpr_read_b32 v[vgprValuC+45], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+46], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+47], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+48], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+49], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+50], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+51], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+52], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+53], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+54], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+55], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+56], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+57], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+58], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+59], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+60], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+61], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+62], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+63], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+64], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+65], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+66], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+67], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+68], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+69], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+70], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+71], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+72], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+73], acc231 // copy acc to 
vreg[249] +v_accvgpr_read_b32 v[vgprValuC+74], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+75], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+76], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+77], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+78], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+79], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)]: each v_pk_mul_f32 below scales one adjacent register pair of vgprValuC+16..+79 in place by s[sgprAlpha:sgprAlpha+1]; NOTE(review): op_sel_hi:[0,1,1] presumably makes both halves use the low Alpha dword - confirm against the ISA guide */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes: each group of eight values is converted in place with v_cvt_f16_f32, packed pairwise by v_pack_b32_f16 into four dwords, and written with one buffer_store_dwordx4 through SrdD at the per-row offset held in v11-v15 / v80-v82. The pack destinations are literal registers (v16.., v24.., ...) while the sources are vgprValuC-relative; NOTE(review): these coincide only if vgprValuC is 0 - confirm against the macro definition */ +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +buffer_store_dwordx4 v[24:27], v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +buffer_store_dwordx4 v[32:35], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 
+v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +buffer_store_dwordx4 v[40:43], v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, 
v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +buffer_store_dwordx4 v[56:59], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +buffer_store_dwordx4 v[64:67], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] 
// convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +buffer_store_dwordx4 v[72:75], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end, skipping the label_GW_B0_E1_M_1 path that follows +label_GW_B0_E1_M_1: + +/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=116 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); 
(0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v127, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v127, v10, v127, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+11], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+12], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+13], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+14], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+15], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+16], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+17], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+18], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+19], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+20], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+21], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+22], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+23], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+24], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+25], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+26], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+27], acc64 // copy acc to 
vreg[16] +v_accvgpr_read_b32 v[vgprValuC+28], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+29], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+30], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+31], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+32], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+33], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+34], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+35], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+36], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+37], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+38], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+39], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+40], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+41], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+42], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+43], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+44], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+45], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+46], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+47], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+48], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+49], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+50], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+51], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+52], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+53], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+54], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+55], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+56], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 
v[vgprValuC+57], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+58], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+59], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+60], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+61], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+62], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+63], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+64], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+65], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+66], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+67], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+68], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+69], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+70], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+71], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+72], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+73], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+74], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+75], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+76], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+77], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+78], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+79], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+80], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+81], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+82], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+83], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+84], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+85], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+86], acc45 // copy acc to 
vreg[75] +v_accvgpr_read_b32 v[vgprValuC+87], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+88], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+89], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+90], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+91], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+92], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+93], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+94], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+95], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+96], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+97], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+98], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+99], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+100], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+101], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+102], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+103], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+104], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+105], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+106], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+107], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+108], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+109], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+110], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+111], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+112], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+113], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+114], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+115], acc161 // copy acc to vreg[104] 
+v_accvgpr_read_b32 v[vgprValuC+116], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+117], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+118], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+119], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+120], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+121], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+122], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+123], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+124], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+125], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+126], acc205 // copy acc to vreg[115] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), 
(0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+126], s[sgprAlpha], v[vgprValuC+126] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert C to fp16 +buffer_store_short v11, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v15, v[vgprValuC+15] // convert C to fp16 +buffer_store_short v15, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 +buffer_store_short v18, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v139, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 +buffer_store_short v23, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 +buffer_store_short v25, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 +buffer_store_short v33, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 +buffer_store_short v34, 
v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 +buffer_store_short v35, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 +buffer_store_short v36, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 +buffer_store_short v37, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 +buffer_store_short v38, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 +buffer_store_short v39, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 +buffer_store_short v40, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 +buffer_store_short v41, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 +buffer_store_short v42, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 +buffer_store_short v43, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 +buffer_store_short v44, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 +buffer_store_short v45, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 +buffer_store_short v46, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 +buffer_store_short v47, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 +buffer_store_short 
v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 +buffer_store_short v49, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 +buffer_store_short v50, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 +buffer_store_short v51, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 +buffer_store_short v52, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 +buffer_store_short v53, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 +buffer_store_short v55, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 +buffer_store_short v56, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 +buffer_store_short v57, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 +buffer_store_short v58, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 +buffer_store_short v59, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 +buffer_store_short v60, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v61, v[vgprValuC+61] // convert C to fp16 +buffer_store_short v61, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v62, v[vgprValuC+62] // convert C to fp16 
+buffer_store_short v62, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v63, v[vgprValuC+63] // convert C to fp16 +buffer_store_short v63, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v64, v[vgprValuC+64] // convert C to fp16 +buffer_store_short v64, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v65, v[vgprValuC+65] // convert C to fp16 +buffer_store_short v65, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v66, v[vgprValuC+66] // convert C to fp16 +buffer_store_short v66, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v67, v[vgprValuC+67] // convert C to fp16 +buffer_store_short v67, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v68, v[vgprValuC+68] // convert C to fp16 +buffer_store_short v68, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v69, v[vgprValuC+69] // convert C to fp16 +buffer_store_short v69, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v70, v[vgprValuC+70] // convert C to fp16 +buffer_store_short v70, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v71, v[vgprValuC+71] // convert C to fp16 +buffer_store_short v71, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v72, v[vgprValuC+72] // convert C to fp16 +buffer_store_short v72, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v73, v[vgprValuC+73] // convert C to fp16 +buffer_store_short v73, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v74, v[vgprValuC+74] // convert C to fp16 +buffer_store_short v74, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v75, v[vgprValuC+75] // convert C to fp16 +buffer_store_short v75, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v76, v[vgprValuC+76] // convert C to 
fp16 +buffer_store_short v76, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v77, v[vgprValuC+77] // convert C to fp16 +buffer_store_short v77, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v78, v[vgprValuC+78] // convert C to fp16 +buffer_store_short v78, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v79, v[vgprValuC+79] // convert C to fp16 +buffer_store_short v79, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v80, v[vgprValuC+80] // convert C to fp16 +buffer_store_short v80, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v81, v[vgprValuC+81] // convert C to fp16 +buffer_store_short v81, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v82, v[vgprValuC+82] // convert C to fp16 +buffer_store_short v82, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v83, v[vgprValuC+83] // convert C to fp16 +buffer_store_short v83, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v84, v[vgprValuC+84] // convert C to fp16 +buffer_store_short v84, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v85, v[vgprValuC+85] // convert C to fp16 +buffer_store_short v85, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v86, v[vgprValuC+86] // convert C to fp16 +buffer_store_short v86, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v87, v[vgprValuC+87] // convert C to fp16 +buffer_store_short v87, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v88, v[vgprValuC+88] // convert C to fp16 +buffer_store_short v88, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v89, v[vgprValuC+89] // convert C to fp16 +buffer_store_short v89, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v90, v[vgprValuC+90] // convert C 
to fp16 +buffer_store_short v90, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v91, v[vgprValuC+91] // convert C to fp16 +buffer_store_short v91, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v92, v[vgprValuC+92] // convert C to fp16 +buffer_store_short v92, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v93, v[vgprValuC+93] // convert C to fp16 +buffer_store_short v93, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v94, v[vgprValuC+94] // convert C to fp16 +buffer_store_short v94, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v95, v[vgprValuC+95] // convert C to fp16 +buffer_store_short v95, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v96, v[vgprValuC+96] // convert C to fp16 +buffer_store_short v96, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v97, v[vgprValuC+97] // convert C to fp16 +buffer_store_short v97, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v98, v[vgprValuC+98] // convert C to fp16 +buffer_store_short v98, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v99, v[vgprValuC+99] // convert C to fp16 +buffer_store_short v99, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v100, v[vgprValuC+100] // convert C to fp16 +buffer_store_short v100, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v101, v[vgprValuC+101] // convert C to fp16 +buffer_store_short v101, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v102, v[vgprValuC+102] // convert C to fp16 +buffer_store_short v102, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v103, v[vgprValuC+103] // convert C to fp16 +buffer_store_short v103, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v104, 
v[vgprValuC+104] // convert C to fp16 +buffer_store_short v104, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v105, v[vgprValuC+105] // convert C to fp16 +buffer_store_short v105, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v106, v[vgprValuC+106] // convert C to fp16 +buffer_store_short v106, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v107, v[vgprValuC+107] // convert C to fp16 +buffer_store_short v107, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v108, v[vgprValuC+108] // convert C to fp16 +buffer_store_short v108, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v109, v[vgprValuC+109] // convert C to fp16 +buffer_store_short v109, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v110, v[vgprValuC+110] // convert C to fp16 +buffer_store_short v110, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v111, v[vgprValuC+111] // convert C to fp16 +buffer_store_short v111, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v112, v[vgprValuC+112] // convert C to fp16 +buffer_store_short v112, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v113, v[vgprValuC+113] // convert C to fp16 +buffer_store_short v113, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v114, v[vgprValuC+114] // convert C to fp16 +buffer_store_short v114, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v115, v[vgprValuC+115] // convert C to fp16 +buffer_store_short v115, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v116, v[vgprValuC+116] // convert C to fp16 +buffer_store_short v116, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v117, v[vgprValuC+117] // convert C to fp16 +buffer_store_short v117, v236, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_cvt_f16_f32 v118, v[vgprValuC+118] // convert C to fp16 +buffer_store_short v118, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v119, v[vgprValuC+119] // convert C to fp16 +buffer_store_short v119, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v120, v[vgprValuC+120] // convert C to fp16 +buffer_store_short v120, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v121, v[vgprValuC+121] // convert C to fp16 +buffer_store_short v121, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v122, v[vgprValuC+122] // convert C to fp16 +buffer_store_short v122, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v123, v[vgprValuC+123] // convert C to fp16 +buffer_store_short v123, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v124, v[vgprValuC+124] // convert C to fp16 +buffer_store_short v124, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v125, v[vgprValuC+125] // convert C to fp16 +buffer_store_short v125, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v126, v[vgprValuC+126] // convert C to fp16 +buffer_store_short v126, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); 
(0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1); (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe 
into Cin addr +v_cndmask_b32 v127, v10, v127, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v129, v10, v129, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v131, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v131, v10, v131, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v136, v10, v136, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v138, v10, v138, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v140, v10, v140, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v142, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v142, v10, v142, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v144, v10, v144, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v146, v10, v146, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v148, v10, v148, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v150, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v150, v10, v150, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v152, v10, v152, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v154, v10, v154, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v160, v10, v160, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v162, v10, v162, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v164, v10, v164, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v166, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v166, v10, v166, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v168, v10, v168, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v170, v10, v170, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v172, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v172, v10, v172, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v174, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v174, v10, v174, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v180, v10, v180, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v182, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v182, v10, v182, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v184, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v184, v10, v184, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v186, v10, v186, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v188, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v188, v10, v188, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v190, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v190, v10, v190, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v192, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v192, v10, v192, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v194, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v194, v10, v194, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v200, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v200, v10, v200, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v202, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v202, v10, v202, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v204, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v204, v10, v204, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v206, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v206, v10, v206, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v208, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v208, v10, v208, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v210, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v210, v10, v210, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v212, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v212, v10, v212, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v214, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v214, v10, v214, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v218, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v218, v10, v218, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v220, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v220, v10, v220, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v222, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v222, v10, v222, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v224, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v224, v10, v224, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v226, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v226, v10, v226, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v228, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v228, v10, v228, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v230, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v230, v10, v230, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v232, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v232, v10, v232, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v234, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v234, v10, v234, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v236, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v236, v10, v236, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v238, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v238, v10, v238, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v240, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v240, v10, v240, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v242, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v242, v10, v242, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v244, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v244, v10, v244, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+11], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+12], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+13], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+14], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+15], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+16], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+17], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+18], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+19], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+20], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+21], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+22], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+23], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+24], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+25], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+26], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+27], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+28], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+29], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+30], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+31], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+32], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+33], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+34], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+35], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+36], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+37], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+38], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+39], acc66 // copy acc to vreg[144] 
+v_accvgpr_read_b32 v[vgprValuC+40], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+41], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+42], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+43], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+44], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+45], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+46], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+47], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+48], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+49], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+50], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+51], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+52], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+53], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+54], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+55], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+56], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+57], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+58], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+59], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+60], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+61], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+62], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+63], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+64], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+65], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+66], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+67], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+68], acc182 // copy acc to vreg[173] 
+v_accvgpr_read_b32 v[vgprValuC+69], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+70], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+71], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+72], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+73], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+74], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+75], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+76], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+77], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+78], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+79], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+80], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+81], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+82], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+83], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+84], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+85], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+86], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+87], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+88], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+89], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+90], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+91], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+92], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+93], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+94], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+95], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+96], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+97], acc43 // copy acc to vreg[202] 
+v_accvgpr_read_b32 v[vgprValuC+98], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+99], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+100], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+101], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+102], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+103], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+104], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+105], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+106], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+107], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+108], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+109], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+110], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+111], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+112], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+113], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+114], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+115], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+116], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+117], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+118], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+119], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+120], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+121], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+122], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+123], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+124], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+125], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+126], acc159 // copy 
acc to vreg[231] + +/* rC *= alpha batchElements=[(0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3), (0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha 
(pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+126], s[sgprAlpha], v[vgprValuC+126] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert 
C to fp16 +buffer_store_short v11, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v15, v[vgprValuC+15] // convert C to fp16 +buffer_store_short v15, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 +buffer_store_short v18, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 +buffer_store_short v23, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v25, v[vgprValuC+25] // 
convert C to fp16 +buffer_store_short v25, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 +buffer_store_short v33, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 +buffer_store_short v34, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 +buffer_store_short v35, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 +buffer_store_short v36, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 +buffer_store_short v37, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 +buffer_store_short v38, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v39, v[vgprValuC+39] 
// convert C to fp16 +buffer_store_short v39, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 +buffer_store_short v40, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 +buffer_store_short v41, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 +buffer_store_short v42, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 +buffer_store_short v43, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 +buffer_store_short v44, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 +buffer_store_short v45, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 +buffer_store_short v46, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 +buffer_store_short v47, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 +buffer_store_short v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 +buffer_store_short v49, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 +buffer_store_short v50, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 +buffer_store_short v51, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 +buffer_store_short v52, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v53, 
v[vgprValuC+53] // convert C to fp16 +buffer_store_short v53, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 +buffer_store_short v54, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 +buffer_store_short v55, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 +buffer_store_short v56, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 +buffer_store_short v57, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 +buffer_store_short v58, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 +buffer_store_short v59, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 +buffer_store_short v60, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v61, v[vgprValuC+61] // convert C to fp16 +buffer_store_short v61, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v62, v[vgprValuC+62] // convert C to fp16 +buffer_store_short v62, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v63, v[vgprValuC+63] // convert C to fp16 +buffer_store_short v63, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v64, v[vgprValuC+64] // convert C to fp16 +buffer_store_short v64, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v65, v[vgprValuC+65] // convert C to fp16 +buffer_store_short v65, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v66, v[vgprValuC+66] // convert C to fp16 +buffer_store_short v66, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 
v67, v[vgprValuC+67] // convert C to fp16 +buffer_store_short v67, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v68, v[vgprValuC+68] // convert C to fp16 +buffer_store_short v68, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v69, v[vgprValuC+69] // convert C to fp16 +buffer_store_short v69, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v70, v[vgprValuC+70] // convert C to fp16 +buffer_store_short v70, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v71, v[vgprValuC+71] // convert C to fp16 +buffer_store_short v71, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v72, v[vgprValuC+72] // convert C to fp16 +buffer_store_short v72, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v73, v[vgprValuC+73] // convert C to fp16 +buffer_store_short v73, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v74, v[vgprValuC+74] // convert C to fp16 +buffer_store_short v74, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v75, v[vgprValuC+75] // convert C to fp16 +buffer_store_short v75, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v76, v[vgprValuC+76] // convert C to fp16 +buffer_store_short v76, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v77, v[vgprValuC+77] // convert C to fp16 +buffer_store_short v77, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v78, v[vgprValuC+78] // convert C to fp16 +buffer_store_short v78, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v79, v[vgprValuC+79] // convert C to fp16 +buffer_store_short v79, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v80, v[vgprValuC+80] // convert C to fp16 +buffer_store_short v80, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_cvt_f16_f32 v81, v[vgprValuC+81] // convert C to fp16 +buffer_store_short v81, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v82, v[vgprValuC+82] // convert C to fp16 +buffer_store_short v82, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v83, v[vgprValuC+83] // convert C to fp16 +buffer_store_short v83, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v84, v[vgprValuC+84] // convert C to fp16 +buffer_store_short v84, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v85, v[vgprValuC+85] // convert C to fp16 +buffer_store_short v85, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v86, v[vgprValuC+86] // convert C to fp16 +buffer_store_short v86, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v87, v[vgprValuC+87] // convert C to fp16 +buffer_store_short v87, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v88, v[vgprValuC+88] // convert C to fp16 +buffer_store_short v88, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v89, v[vgprValuC+89] // convert C to fp16 +buffer_store_short v89, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v90, v[vgprValuC+90] // convert C to fp16 +buffer_store_short v90, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v91, v[vgprValuC+91] // convert C to fp16 +buffer_store_short v91, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v92, v[vgprValuC+92] // convert C to fp16 +buffer_store_short v92, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v93, v[vgprValuC+93] // convert C to fp16 +buffer_store_short v93, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v94, v[vgprValuC+94] // convert C to fp16 +buffer_store_short v94, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store 
D +v_cvt_f16_f32 v95, v[vgprValuC+95] // convert C to fp16 +buffer_store_short v95, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v96, v[vgprValuC+96] // convert C to fp16 +buffer_store_short v96, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v97, v[vgprValuC+97] // convert C to fp16 +buffer_store_short v97, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v98, v[vgprValuC+98] // convert C to fp16 +buffer_store_short v98, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v99, v[vgprValuC+99] // convert C to fp16 +buffer_store_short v99, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v100, v[vgprValuC+100] // convert C to fp16 +buffer_store_short v100, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v101, v[vgprValuC+101] // convert C to fp16 +buffer_store_short v101, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v102, v[vgprValuC+102] // convert C to fp16 +buffer_store_short v102, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v103, v[vgprValuC+103] // convert C to fp16 +buffer_store_short v103, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v104, v[vgprValuC+104] // convert C to fp16 +buffer_store_short v104, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v105, v[vgprValuC+105] // convert C to fp16 +buffer_store_short v105, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v106, v[vgprValuC+106] // convert C to fp16 +buffer_store_short v106, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v107, v[vgprValuC+107] // convert C to fp16 +buffer_store_short v107, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v108, v[vgprValuC+108] // convert C to fp16 +buffer_store_short v108, v227, s[sgprSrdD:sgprSrdD+3], 
0 offen offset:0 nt // store D +v_cvt_f16_f32 v109, v[vgprValuC+109] // convert C to fp16 +buffer_store_short v109, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v110, v[vgprValuC+110] // convert C to fp16 +buffer_store_short v110, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v111, v[vgprValuC+111] // convert C to fp16 +buffer_store_short v111, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v112, v[vgprValuC+112] // convert C to fp16 +buffer_store_short v112, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v113, v[vgprValuC+113] // convert C to fp16 +buffer_store_short v113, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v114, v[vgprValuC+114] // convert C to fp16 +buffer_store_short v114, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v115, v[vgprValuC+115] // convert C to fp16 +buffer_store_short v115, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v116, v[vgprValuC+116] // convert C to fp16 +buffer_store_short v116, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v117, v[vgprValuC+117] // convert C to fp16 +buffer_store_short v117, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v118, v[vgprValuC+118] // convert C to fp16 +buffer_store_short v118, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v119, v[vgprValuC+119] // convert C to fp16 +buffer_store_short v119, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v120, v[vgprValuC+120] // convert C to fp16 +buffer_store_short v120, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v121, v[vgprValuC+121] // convert C to fp16 +buffer_store_short v121, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v122, v[vgprValuC+122] // convert C to fp16 
+buffer_store_short v122, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v123, v[vgprValuC+123] // convert C to fp16 +buffer_store_short v123, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v124, v[vgprValuC+124] // convert C to fp16 +buffer_store_short v124, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v125, v[vgprValuC+125] // convert C to fp16 +buffer_store_short v125, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v126, v[vgprValuC+126] // convert C to fp16 +buffer_store_short v126, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 
v35, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v35, v10, v35, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v36, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v36, v10, v36, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v37, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v37, v10, v37, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v38, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v38, v10, v38, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v39, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v39, v10, v39, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v40, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v40, v10, v40, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v41, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v41, v10, v41, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v42, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v42, v10, v42, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v43, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v43, v10, v43, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v45, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v45, v10, v45, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v47, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v47, v10, v47, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v49, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v49, v10, v49, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v51, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v51, v10, v51, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v53, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v53, v10, v53, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v55, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v55, v10, v55, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v57, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v57, v10, v57, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+11], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+12], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+13], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+14], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+15], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+16], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+17], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+18], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+19], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+20], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+21], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+22], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+23], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+24], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+25], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+26], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+27], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+28], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+29], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+30], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+31], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+32], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+33], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+34], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 
0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+34], s[sgprAlpha], v[vgprValuC+34] // *= alpha + +/* apply mask, calc new C and issue writes */ +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert C to fp16 +buffer_store_short v11, v35, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v36, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v37, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v15, v[vgprValuC+15] // convert C to fp16 +buffer_store_short v15, v39, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v41, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 +buffer_store_short v18, v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 +buffer_store_short v23, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 +buffer_store_short v25, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v50, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 +buffer_store_short v33, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 +buffer_store_short v34, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_Beta_2: +s_and_b32 s30, 255, s[sgprSizeI] // s30 = s[sgprSizeI] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s31 // wg0 >= nwg0-1 ? +s_cselect_b32 s30, s30, 0 // set rMT0 +s_cmpk_gt_u32 s30, 0 // rMT0 > 0 +s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required +s_and_b32 s30, 255, s[sgprSizeJ] // s30 = s[sgprSizeJ] % 256 +s_add_u32 s31, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s31 // wg1 >= nwg1-1 +s_cselect_b32 s30, s30, 0 // set rMT1 +s_cmpk_gt_u32 s30, 0 // rMT1 > 0 +s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required +label_GW_B1_E0: + +/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=18 */ +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_add_lshl_u32 v12, v6, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 +buffer_load_dwordx4 v[128:131], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[168:171], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[172:175], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[176:179], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 
// load C +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[180:183], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[184:187], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[188:191], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[192:195], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[196:199], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 
s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[200:203], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[204:207], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[208:211], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[212:215], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[216:219], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[220:223], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +s_lshl_b32 s12, 
s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[224:227], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[228:231], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[232:235], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v11, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=4, coord0Vgpr=4 +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 
v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // 
copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 
v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] 
+v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+152], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+153], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+154], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+155], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+156], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+157], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+158], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+159], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+160], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+161], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+162], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+163], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+164], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+165], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+166], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+167], acc62 // copy acc to vreg[143] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 
0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ + +s_waitcnt vmcnt(17) // vmcnt(17) = 18 - 1 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v128, v[vgprValuC+16] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v128, v[vgprValuC+17] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v129, v[vgprValuC+18] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v129, v[vgprValuC+19] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v130, v[vgprValuC+20] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v130, v[vgprValuC+21] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v131, v[vgprValuC+22] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v131, v[vgprValuC+23] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // 
convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(16) = 18 - 2 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v168, v[vgprValuC+24] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v168, v[vgprValuC+25] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v169, v[vgprValuC+26] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v169, v[vgprValuC+27] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v170, v[vgprValuC+28] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v170, v[vgprValuC+29] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v171, v[vgprValuC+30] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v171, v[vgprValuC+31] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to 
fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(15) = 18 - 3 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v172, v[vgprValuC+32] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v172, v[vgprValuC+33] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v173, v[vgprValuC+34] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v173, v[vgprValuC+35] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v174, v[vgprValuC+36] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v174, v[vgprValuC+37] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v175, v[vgprValuC+38] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v175, v[vgprValuC+39] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // 
Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(14) = 18 - 4 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v176, v[vgprValuC+40] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v176, v[vgprValuC+41] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v177, v[vgprValuC+42] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v177, v[vgprValuC+43] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v178, v[vgprValuC+44] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v178, v[vgprValuC+45] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v179, v[vgprValuC+46] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v179, v[vgprValuC+47] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], 
v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(13) = 18 - 5 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v180, v[vgprValuC+48] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v180, v[vgprValuC+49] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v181, v[vgprValuC+50] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v181, v[vgprValuC+51] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v182, v[vgprValuC+52] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v183, v[vgprValuC+54] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v183, v[vgprValuC+55] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(12) = 18 - 6 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v184, v[vgprValuC+56] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v184, v[vgprValuC+57] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v185, v[vgprValuC+58] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v185, v[vgprValuC+59] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v186, v[vgprValuC+60] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v186, v[vgprValuC+61] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v187, v[vgprValuC+62] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v187, v[vgprValuC+63] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 
+v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(11) = 18 - 7 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v188, v[vgprValuC+64] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v188, v[vgprValuC+65] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v189, v[vgprValuC+66] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v189, v[vgprValuC+67] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v190, v[vgprValuC+68] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v190, v[vgprValuC+69] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v191, v[vgprValuC+70] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v191, v[vgprValuC+71] op_sel:[0,1,0] 
op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(10) = 18 - 8 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v192, v[vgprValuC+72] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v192, v[vgprValuC+73] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v193, v[vgprValuC+74] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v193, v[vgprValuC+75] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v194, v[vgprValuC+76] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v194, v[vgprValuC+77] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+78], 
s[sgprBeta], v195, v[vgprValuC+78] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v195, v[vgprValuC+79] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(9) = 18 - 9 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v196, v[vgprValuC+80] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v196, v[vgprValuC+81] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v197, v[vgprValuC+82] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v197, v[vgprValuC+83] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v198, v[vgprValuC+84] op_sel:[0,0,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v198, v[vgprValuC+85] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v199, v[vgprValuC+86] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+87], s[sgprBeta], v199, v[vgprValuC+87] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(8) = 18 - 10 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v200, v[vgprValuC+88] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+89], s[sgprBeta], v200, v[vgprValuC+89] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+90], s[sgprBeta], v201, v[vgprValuC+90] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+91], s[sgprBeta], v201, 
v[vgprValuC+91] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+92], s[sgprBeta], v202, v[vgprValuC+92] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+93], s[sgprBeta], v202, v[vgprValuC+93] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+94], s[sgprBeta], v203, v[vgprValuC+94] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+95], s[sgprBeta], v203, v[vgprValuC+95] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(7) = 18 - 11 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+96], s[sgprBeta], v204, v[vgprValuC+96] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+97], s[sgprBeta], v204, v[vgprValuC+97] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_fma_mix_f32 v[vgprValuC+98], s[sgprBeta], v205, v[vgprValuC+98] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+99], s[sgprBeta], v205, v[vgprValuC+99] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+100], s[sgprBeta], v206, v[vgprValuC+100] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+101], s[sgprBeta], v206, v[vgprValuC+101] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+102], s[sgprBeta], v207, v[vgprValuC+102] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+103], s[sgprBeta], v207, v[vgprValuC+103] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 +v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(6) = 18 - 12 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+104], s[sgprBeta], v208, 
v[vgprValuC+104] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+105], s[sgprBeta], v208, v[vgprValuC+105] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+106], s[sgprBeta], v209, v[vgprValuC+106] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+107], s[sgprBeta], v209, v[vgprValuC+107] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+108], s[sgprBeta], v210, v[vgprValuC+108] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+109], s[sgprBeta], v210, v[vgprValuC+109] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+110], s[sgprBeta], v211, v[vgprValuC+110] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+111], s[sgprBeta], v211, v[vgprValuC+111] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) 
+buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(5) = 18 - 13 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+112], s[sgprBeta], v212, v[vgprValuC+112] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+113], s[sgprBeta], v212, v[vgprValuC+113] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+114], s[sgprBeta], v213, v[vgprValuC+114] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+115], s[sgprBeta], v213, v[vgprValuC+115] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+116], s[sgprBeta], v214, v[vgprValuC+116] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+117], s[sgprBeta], v214, v[vgprValuC+117] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+118], s[sgprBeta], v215, v[vgprValuC+118] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+119], s[sgprBeta], v215, v[vgprValuC+119] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, v[vgprValuC+112], v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 
1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(4) = 18 - 14 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+120], s[sgprBeta], v216, v[vgprValuC+120] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+121], s[sgprBeta], v216, v[vgprValuC+121] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+122], s[sgprBeta], v217, v[vgprValuC+122] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+123], s[sgprBeta], v217, v[vgprValuC+123] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+124], s[sgprBeta], v218, v[vgprValuC+124] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+125], s[sgprBeta], v218, v[vgprValuC+125] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+126], s[sgprBeta], v219, v[vgprValuC+126] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+127], s[sgprBeta], v219, v[vgprValuC+127] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // 
convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(3) = 18 - 15 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+136], s[sgprBeta], v220, v[vgprValuC+136] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+137], s[sgprBeta], v220, v[vgprValuC+137] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+138], s[sgprBeta], v221, v[vgprValuC+138] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+139], s[sgprBeta], v221, v[vgprValuC+139] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+140], s[sgprBeta], v222, v[vgprValuC+140] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+141], s[sgprBeta], v222, v[vgprValuC+141] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+142], s[sgprBeta], v223, v[vgprValuC+142] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+143], s[sgprBeta], v223, v[vgprValuC+143] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+136], v[vgprValuC+136] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+137], v[vgprValuC+137] // convert C to fp16 +v_pack_b32_f16 v136, v[vgprValuC+136], v[vgprValuC+137] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+138], v[vgprValuC+138] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+139], v[vgprValuC+139] // convert C to fp16 +v_pack_b32_f16 v137, v[vgprValuC+138], v[vgprValuC+139] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+140], v[vgprValuC+140] // convert C to fp16 
+v_cvt_f16_f32 v[vgprValuC+141], v[vgprValuC+141] // convert C to fp16 +v_pack_b32_f16 v138, v[vgprValuC+140], v[vgprValuC+141] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+142], v[vgprValuC+142] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+143], v[vgprValuC+143] // convert C to fp16 +v_pack_b32_f16 v139, v[vgprValuC+142], v[vgprValuC+143] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[136:139], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(2) = 18 - 16 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+144], s[sgprBeta], v224, v[vgprValuC+144] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+145], s[sgprBeta], v224, v[vgprValuC+145] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+146], s[sgprBeta], v225, v[vgprValuC+146] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+147], s[sgprBeta], v225, v[vgprValuC+147] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+148], s[sgprBeta], v226, v[vgprValuC+148] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+149], s[sgprBeta], v226, v[vgprValuC+149] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+150], s[sgprBeta], v227, v[vgprValuC+150] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+151], s[sgprBeta], v227, v[vgprValuC+151] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+144], v[vgprValuC+144] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+145], v[vgprValuC+145] // convert C to fp16 +v_pack_b32_f16 v144, v[vgprValuC+144], v[vgprValuC+145] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+146], v[vgprValuC+146] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+147], v[vgprValuC+147] // convert C to fp16 +v_pack_b32_f16 v145, v[vgprValuC+146], v[vgprValuC+147] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+148], v[vgprValuC+148] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+149], v[vgprValuC+149] // convert C to fp16 +v_pack_b32_f16 v146, v[vgprValuC+148], v[vgprValuC+149] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+150], v[vgprValuC+150] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+151], v[vgprValuC+151] // convert C to fp16 +v_pack_b32_f16 v147, v[vgprValuC+150], v[vgprValuC+151] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[144:147], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(1) = 18 - 17 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+152], s[sgprBeta], v228, v[vgprValuC+152] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+153], s[sgprBeta], v228, v[vgprValuC+153] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+154], s[sgprBeta], v229, v[vgprValuC+154] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+155], s[sgprBeta], v229, v[vgprValuC+155] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+156], s[sgprBeta], v230, v[vgprValuC+156] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+157], s[sgprBeta], v230, v[vgprValuC+157] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+158], s[sgprBeta], v231, v[vgprValuC+158] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+159], s[sgprBeta], v231, v[vgprValuC+159] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+152], v[vgprValuC+152] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+153], 
v[vgprValuC+153] // convert C to fp16 +v_pack_b32_f16 v152, v[vgprValuC+152], v[vgprValuC+153] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+154], v[vgprValuC+154] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+155], v[vgprValuC+155] // convert C to fp16 +v_pack_b32_f16 v153, v[vgprValuC+154], v[vgprValuC+155] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+156], v[vgprValuC+156] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+157], v[vgprValuC+157] // convert C to fp16 +v_pack_b32_f16 v154, v[vgprValuC+156], v[vgprValuC+157] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+158], v[vgprValuC+158] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+159], v[vgprValuC+159] // convert C to fp16 +v_pack_b32_f16 v155, v[vgprValuC+158], v[vgprValuC+159] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[152:155], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(17) // vmcnt(0) = 18 - 18 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+160], s[sgprBeta], v232, v[vgprValuC+160] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+161], s[sgprBeta], v232, v[vgprValuC+161] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+162], s[sgprBeta], v233, v[vgprValuC+162] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+163], s[sgprBeta], v233, v[vgprValuC+163] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+164], s[sgprBeta], v234, v[vgprValuC+164] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+165], s[sgprBeta], v234, v[vgprValuC+165] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+166], s[sgprBeta], v235, v[vgprValuC+166] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 
v[vgprValuC+167], s[sgprBeta], v235, v[vgprValuC+167] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+160], v[vgprValuC+160] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+161], v[vgprValuC+161] // convert C to fp16 +v_pack_b32_f16 v160, v[vgprValuC+160], v[vgprValuC+161] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+162], v[vgprValuC+162] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+163], v[vgprValuC+163] // convert C to fp16 +v_pack_b32_f16 v161, v[vgprValuC+162], v[vgprValuC+163] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+164], v[vgprValuC+164] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+165], v[vgprValuC+165] // convert C to fp16 +v_pack_b32_f16 v162, v[vgprValuC+164], v[vgprValuC+165] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+166], v[vgprValuC+166] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+167], v[vgprValuC+167] // convert C to fp16 +v_pack_b32_f16 v163, v[vgprValuC+166], v[vgprValuC+167] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[160:163], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ 
+s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[128:131], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[136:139], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[140:143], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[144:147], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[148:151], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: 
gra SRD += inc(upper) +buffer_load_dwordx4 v[152:155], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[156:159], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[160:163], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[164:167], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[168:171], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[172:175], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[176:179], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[180:183], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +s_lshl_b32 s12, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_load_dwordx4 v[184:187], v12, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_accvgpr_read_b32 v[vgprValuC+16], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+17], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+18], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+19], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+20], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+21], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+22], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+23], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+24], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+25], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+26], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+27], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+28], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+29], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+30], acc122 // copy acc to vreg[158] 
+v_accvgpr_read_b32 v[vgprValuC+31], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+32], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+33], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+34], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+35], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+36], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+37], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+38], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+39], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+40], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+41], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+42], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+43], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+44], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+45], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+46], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+47], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+48], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+49], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+50], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+51], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+52], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+53], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+54], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+55], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+56], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+57], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+58], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+59], acc238 // copy acc to 
vreg[187] +v_accvgpr_read_b32 v[vgprValuC+60], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+61], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+62], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+63], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+64], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+65], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+66], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+67], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+68], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+69], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+70], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+71], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+72], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+73], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+74], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+75], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+76], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+77], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+78], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+79], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+80], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+81], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+82], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+83], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+84], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+85], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+86], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+87], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+88], acc99 // copy acc to vreg[216] 
+v_accvgpr_read_b32 v[vgprValuC+89], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+90], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+91], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+92], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+93], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+94], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+95], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+96], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+97], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+98], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+99], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+100], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+101], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+102], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+103], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+104], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+105], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+106], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+107], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+108], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+109], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+110], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+111], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+112], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+113], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+114], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+115], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+116], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+117], acc215 // 
copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+118], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+119], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+120], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+121], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+122], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+123], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+124], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+125], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+126], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+127], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) + +/* apply mask, calc new C and issue writes */ + +s_waitcnt vmcnt(13) // vmcnt(13) = 14 - 1 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v128, v[vgprValuC+16] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v128, v[vgprValuC+17] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v129, v[vgprValuC+18] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v129, v[vgprValuC+19] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v130, v[vgprValuC+20] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v130, v[vgprValuC+21] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v131, v[vgprValuC+22] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v131, v[vgprValuC+23] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], 
v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(12) = 14 - 2 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v136, v[vgprValuC+24] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v136, v[vgprValuC+25] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v137, v[vgprValuC+26] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v137, v[vgprValuC+27] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v138, v[vgprValuC+28] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v138, v[vgprValuC+29] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v139, v[vgprValuC+30] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v139, v[vgprValuC+31] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 
v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(11) = 14 - 3 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v140, v[vgprValuC+32] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v140, v[vgprValuC+33] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v141, v[vgprValuC+34] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v141, v[vgprValuC+35] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v142, v[vgprValuC+36] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v142, v[vgprValuC+37] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v143, v[vgprValuC+38] op_sel:[0,0,0] 
op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v143, v[vgprValuC+39] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(10) = 14 - 4 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v144, v[vgprValuC+40] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v144, v[vgprValuC+41] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v145, v[vgprValuC+42] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v145, v[vgprValuC+43] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v146, v[vgprValuC+44] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+45], 
s[sgprBeta], v146, v[vgprValuC+45] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v147, v[vgprValuC+46] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v147, v[vgprValuC+47] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(9) = 14 - 5 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v148, v[vgprValuC+48] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v148, v[vgprValuC+49] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v149, v[vgprValuC+50] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v149, v[vgprValuC+51] op_sel:[0,1,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v150, v[vgprValuC+52] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v150, v[vgprValuC+53] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v151, v[vgprValuC+54] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v151, v[vgprValuC+55] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(8) = 14 - 6 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v152, v[vgprValuC+56] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v152, v[vgprValuC+57] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v153, 
v[vgprValuC+58] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v153, v[vgprValuC+59] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v154, v[vgprValuC+60] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v154, v[vgprValuC+61] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v155, v[vgprValuC+62] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v155, v[vgprValuC+63] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(7) = 14 - 7 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v156, v[vgprValuC+64] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v156, v[vgprValuC+65] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v157, v[vgprValuC+66] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v157, v[vgprValuC+67] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v158, v[vgprValuC+68] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v158, v[vgprValuC+69] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v159, v[vgprValuC+70] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v159, v[vgprValuC+71] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // 
vmcnt(6) = 14 - 8 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v160, v[vgprValuC+72] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v160, v[vgprValuC+73] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v161, v[vgprValuC+74] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v161, v[vgprValuC+75] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v162, v[vgprValuC+76] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v162, v[vgprValuC+77] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+78], s[sgprBeta], v163, v[vgprValuC+78] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v163, v[vgprValuC+79] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // 
incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(5) = 14 - 9 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v164, v[vgprValuC+80] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v164, v[vgprValuC+81] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v165, v[vgprValuC+82] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v165, v[vgprValuC+83] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v166, v[vgprValuC+84] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v166, v[vgprValuC+85] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v167, v[vgprValuC+86] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+87], s[sgprBeta], v167, v[vgprValuC+87] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // 
incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(4) = 14 - 10 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v168, v[vgprValuC+88] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+89], s[sgprBeta], v168, v[vgprValuC+89] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+90], s[sgprBeta], v169, v[vgprValuC+90] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+91], s[sgprBeta], v169, v[vgprValuC+91] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+92], s[sgprBeta], v170, v[vgprValuC+92] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+93], s[sgprBeta], v170, v[vgprValuC+93] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+94], s[sgprBeta], v171, v[vgprValuC+94] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+95], s[sgprBeta], v171, v[vgprValuC+95] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], 
v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(3) = 14 - 11 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+96], s[sgprBeta], v172, v[vgprValuC+96] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+97], s[sgprBeta], v172, v[vgprValuC+97] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+98], s[sgprBeta], v173, v[vgprValuC+98] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+99], s[sgprBeta], v173, v[vgprValuC+99] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+100], s[sgprBeta], v174, v[vgprValuC+100] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+101], s[sgprBeta], v174, v[vgprValuC+101] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+102], s[sgprBeta], v175, v[vgprValuC+102] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+103], s[sgprBeta], v175, v[vgprValuC+103] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 
+v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(2) = 14 - 12 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+104], s[sgprBeta], v176, v[vgprValuC+104] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+105], s[sgprBeta], v176, v[vgprValuC+105] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+106], s[sgprBeta], v177, v[vgprValuC+106] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+107], s[sgprBeta], v177, v[vgprValuC+107] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+108], s[sgprBeta], v178, v[vgprValuC+108] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+109], s[sgprBeta], v178, v[vgprValuC+109] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+110], s[sgprBeta], v179, v[vgprValuC+110] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+111], s[sgprBeta], v179, v[vgprValuC+111] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, 
v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(1) = 14 - 13 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+112], s[sgprBeta], v180, v[vgprValuC+112] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+113], s[sgprBeta], v180, v[vgprValuC+113] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+114], s[sgprBeta], v181, v[vgprValuC+114] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+115], s[sgprBeta], v181, v[vgprValuC+115] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+116], s[sgprBeta], v182, v[vgprValuC+116] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+117], s[sgprBeta], v182, v[vgprValuC+117] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+118], s[sgprBeta], v183, v[vgprValuC+118] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+119], s[sgprBeta], v183, v[vgprValuC+119] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, v[vgprValuC+112], 
v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D + +s_waitcnt vmcnt(13) // vmcnt(0) = 14 - 14 (beta) (interleaved) +v_fma_mix_f32 v[vgprValuC+120], s[sgprBeta], v184, v[vgprValuC+120] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+121], s[sgprBeta], v184, v[vgprValuC+121] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+122], s[sgprBeta], v185, v[vgprValuC+122] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+123], s[sgprBeta], v185, v[vgprValuC+123] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+124], s[sgprBeta], v186, v[vgprValuC+124] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+125], s[sgprBeta], v186, v[vgprValuC+125] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+126], s[sgprBeta], v187, v[vgprValuC+126] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+127], s[sgprBeta], v187, v[vgprValuC+127] op_sel:[0,1,0] 
op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +s_lshl_b32 s12, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12 // incToNextRow: gra SRD += inc(lower) +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) +buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B1_E1_N: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=16 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v11, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[12:15], v11, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v11, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[128:131], v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[152:155], v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v156, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[160:163], v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[164:167], v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v158, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[168:171], v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[172:175], v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v176, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[180:183], v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[184:187], v178, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v178, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[188:191], v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[192:195], v196, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v196, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[200:203], v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[204:207], v198, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v198, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[208:211], v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[212:215], v216, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v216, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. 
offset +buffer_load_dwordx4 v[220:223], v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+17], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+18], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+19], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+20], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+21], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+22], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+23], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+24], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+25], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+26], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+27], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+28], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+29], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+30], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+31], acc60 // copy acc to vreg[15] +v_accvgpr_read_b32 v[vgprValuC+32], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+33], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+34], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+35], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+36], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+37], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+38], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+39], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+40], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+41], acc100 // copy acc to vreg[25] 
+v_accvgpr_read_b32 v[vgprValuC+42], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+43], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+44], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+45], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+46], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+47], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+48], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+49], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+50], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+51], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+52], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+53], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+54], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+55], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+56], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+57], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+58], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+59], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+60], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+61], acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+62], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+63], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+64], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+65], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+66], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+67], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+68], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+69], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+70], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 
v[vgprValuC+71], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+72], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+73], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+74], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+75], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+76], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+77], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+78], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+79], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+80], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+81], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+82], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+83], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+84], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+85], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+86], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+87], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+88], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+89], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+90], acc41 // copy acc to vreg[74] +v_accvgpr_read_b32 v[vgprValuC+91], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+92], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+93], acc53 // copy acc to vreg[77] +v_accvgpr_read_b32 v[vgprValuC+94], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+95], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+96], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+97], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+98], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+99], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+100], acc81 // copy acc to vreg[84] 
+v_accvgpr_read_b32 v[vgprValuC+101], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+102], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+103], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+104], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+105], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+106], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+107], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+108], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+109], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+110], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+111], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+112], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+113], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+114], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+115], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+116], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+117], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+118], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+119], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+120], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+121], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+122], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 v[vgprValuC+123], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+124], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+125], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+126], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+127], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+136], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+137], acc197 // copy acc 
to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+138], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+139], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+140], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+141], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+142], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+143], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+144], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+145], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+146], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+147], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+148], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+149], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+150], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+151], acc253 // copy acc to vreg[127] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v12, v[vgprValuC+16] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v12, v[vgprValuC+17] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v13, v[vgprValuC+18] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v13, v[vgprValuC+19] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v14, v[vgprValuC+20] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v14, v[vgprValuC+21] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v15, v[vgprValuC+22] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v15, v[vgprValuC+23] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 
+v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v128, v[vgprValuC+24] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v128, v[vgprValuC+25] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v129, v[vgprValuC+26] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v129, v[vgprValuC+27] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v130, v[vgprValuC+28] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v130, v[vgprValuC+29] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v131, v[vgprValuC+30] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v131, v[vgprValuC+31] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +buffer_store_dwordx4 v[24:27], v135, s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v152, v[vgprValuC+32] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v152, v[vgprValuC+33] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v153, v[vgprValuC+34] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v153, v[vgprValuC+35] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v154, v[vgprValuC+36] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v154, v[vgprValuC+37] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v155, v[vgprValuC+38] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v155, v[vgprValuC+39] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +buffer_store_dwordx4 v[32:35], v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v160, v[vgprValuC+40] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v160, v[vgprValuC+41] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v161, v[vgprValuC+42] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v161, v[vgprValuC+43] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v162, v[vgprValuC+44] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v162, v[vgprValuC+45] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v163, v[vgprValuC+46] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v163, v[vgprValuC+47] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +buffer_store_dwordx4 v[40:43], v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v164, v[vgprValuC+48] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v164, v[vgprValuC+49] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+50], 
s[sgprBeta], v165, v[vgprValuC+50] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v165, v[vgprValuC+51] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v166, v[vgprValuC+52] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v166, v[vgprValuC+53] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v167, v[vgprValuC+54] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v167, v[vgprValuC+55] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +buffer_store_dwordx4 v[48:51], v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v168, v[vgprValuC+56] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v168, v[vgprValuC+57] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v169, v[vgprValuC+58] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v169, 
v[vgprValuC+59] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v170, v[vgprValuC+60] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v170, v[vgprValuC+61] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v171, v[vgprValuC+62] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v171, v[vgprValuC+63] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +buffer_store_dwordx4 v[56:59], v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v172, v[vgprValuC+64] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v172, v[vgprValuC+65] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v173, v[vgprValuC+66] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v173, v[vgprValuC+67] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v174, v[vgprValuC+68] op_sel:[0,0,0] 
op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v174, v[vgprValuC+69] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v175, v[vgprValuC+70] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v175, v[vgprValuC+71] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor +buffer_store_dwordx4 v[64:67], v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v180, v[vgprValuC+72] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v180, v[vgprValuC+73] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v181, v[vgprValuC+74] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v181, v[vgprValuC+75] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v182, v[vgprValuC+76] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v182, v[vgprValuC+77] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_fma_mix_f32 v[vgprValuC+78], s[sgprBeta], v183, v[vgprValuC+78] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v183, v[vgprValuC+79] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +buffer_store_dwordx4 v[72:75], v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v184, v[vgprValuC+80] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v184, v[vgprValuC+81] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v185, v[vgprValuC+82] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v185, v[vgprValuC+83] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v186, v[vgprValuC+84] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v186, v[vgprValuC+85] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v187, v[vgprValuC+86] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+87], 
s[sgprBeta], v187, v[vgprValuC+87] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +buffer_store_dwordx4 v[80:83], v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v188, v[vgprValuC+88] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+89], s[sgprBeta], v188, v[vgprValuC+89] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+90], s[sgprBeta], v189, v[vgprValuC+90] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+91], s[sgprBeta], v189, v[vgprValuC+91] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+92], s[sgprBeta], v190, v[vgprValuC+92] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+93], s[sgprBeta], v190, v[vgprValuC+93] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+94], s[sgprBeta], v191, v[vgprValuC+94] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+95], s[sgprBeta], v191, v[vgprValuC+95] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to 
fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +buffer_store_dwordx4 v[88:91], v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+96], s[sgprBeta], v192, v[vgprValuC+96] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+97], s[sgprBeta], v192, v[vgprValuC+97] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+98], s[sgprBeta], v193, v[vgprValuC+98] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+99], s[sgprBeta], v193, v[vgprValuC+99] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+100], s[sgprBeta], v194, v[vgprValuC+100] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+101], s[sgprBeta], v194, v[vgprValuC+101] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+102], s[sgprBeta], v195, v[vgprValuC+102] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+103], s[sgprBeta], v195, v[vgprValuC+103] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack 
with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 +v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +buffer_store_dwordx4 v[96:99], v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+104], s[sgprBeta], v200, v[vgprValuC+104] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+105], s[sgprBeta], v200, v[vgprValuC+105] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+106], s[sgprBeta], v201, v[vgprValuC+106] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+107], s[sgprBeta], v201, v[vgprValuC+107] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+108], s[sgprBeta], v202, v[vgprValuC+108] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+109], s[sgprBeta], v202, v[vgprValuC+109] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+110], s[sgprBeta], v203, v[vgprValuC+110] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+111], s[sgprBeta], v203, v[vgprValuC+111] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +buffer_store_dwordx4 v[104:107], v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+112], s[sgprBeta], v204, v[vgprValuC+112] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+113], s[sgprBeta], v204, v[vgprValuC+113] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+114], s[sgprBeta], v205, v[vgprValuC+114] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+115], s[sgprBeta], v205, v[vgprValuC+115] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+116], s[sgprBeta], v206, v[vgprValuC+116] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+117], s[sgprBeta], v206, v[vgprValuC+117] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+118], s[sgprBeta], v207, v[vgprValuC+118] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+119], s[sgprBeta], v207, v[vgprValuC+119] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, v[vgprValuC+112], v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], 
v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +buffer_store_dwordx4 v[112:115], v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+120], s[sgprBeta], v208, v[vgprValuC+120] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+121], s[sgprBeta], v208, v[vgprValuC+121] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+122], s[sgprBeta], v209, v[vgprValuC+122] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+123], s[sgprBeta], v209, v[vgprValuC+123] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+124], s[sgprBeta], v210, v[vgprValuC+124] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+125], s[sgprBeta], v210, v[vgprValuC+125] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+126], s[sgprBeta], v211, v[vgprValuC+126] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+127], s[sgprBeta], v211, v[vgprValuC+127] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert 
C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +buffer_store_dwordx4 v[120:123], v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+136], s[sgprBeta], v212, v[vgprValuC+136] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+137], s[sgprBeta], v212, v[vgprValuC+137] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+138], s[sgprBeta], v213, v[vgprValuC+138] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+139], s[sgprBeta], v213, v[vgprValuC+139] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+140], s[sgprBeta], v214, v[vgprValuC+140] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+141], s[sgprBeta], v214, v[vgprValuC+141] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+142], s[sgprBeta], v215, v[vgprValuC+142] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+143], s[sgprBeta], v215, v[vgprValuC+143] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+136], v[vgprValuC+136] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+137], v[vgprValuC+137] // convert C to fp16 +v_pack_b32_f16 v136, v[vgprValuC+136], v[vgprValuC+137] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+138], v[vgprValuC+138] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+139], v[vgprValuC+139] // convert C to fp16 +v_pack_b32_f16 v137, v[vgprValuC+138], v[vgprValuC+139] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+140], v[vgprValuC+140] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+141], v[vgprValuC+141] // convert C to fp16 +v_pack_b32_f16 
v138, v[vgprValuC+140], v[vgprValuC+141] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+142], v[vgprValuC+142] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+143], v[vgprValuC+143] // convert C to fp16 +v_pack_b32_f16 v139, v[vgprValuC+142], v[vgprValuC+143] // Pack with neighbor +buffer_store_dwordx4 v[136:139], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+144], s[sgprBeta], v220, v[vgprValuC+144] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+145], s[sgprBeta], v220, v[vgprValuC+145] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+146], s[sgprBeta], v221, v[vgprValuC+146] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+147], s[sgprBeta], v221, v[vgprValuC+147] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+148], s[sgprBeta], v222, v[vgprValuC+148] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+149], s[sgprBeta], v222, v[vgprValuC+149] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+150], s[sgprBeta], v223, v[vgprValuC+150] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+151], s[sgprBeta], v223, v[vgprValuC+151] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+144], v[vgprValuC+144] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+145], v[vgprValuC+145] // convert C to fp16 +v_pack_b32_f16 v144, v[vgprValuC+144], v[vgprValuC+145] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+146], v[vgprValuC+146] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+147], v[vgprValuC+147] // convert C to fp16 +v_pack_b32_f16 v145, v[vgprValuC+146], v[vgprValuC+147] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+148], v[vgprValuC+148] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+149], v[vgprValuC+149] // convert C to fp16 +v_pack_b32_f16 v146, v[vgprValuC+148], v[vgprValuC+149] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+150], 
v[vgprValuC+150] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+151], v[vgprValuC+151] // convert C to fp16 +v_pack_b32_f16 v147, v[vgprValuC+150], v[vgprValuC+151] // Pack with neighbor +buffer_store_dwordx4 v[144:147], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v11, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[12:15], v11, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v11, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v11, v10, v11, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[128:131], v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v156, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[152:155], v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v156, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v156, v10, v156, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[160:163], v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v158, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[164:167], v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v158, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v158, v10, v158, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[168:171], v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v176, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[172:175], v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v176, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v176, v10, v176, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[180:183], v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v178, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[184:187], v178, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v178, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v178, v10, v178, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[188:191], v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v196, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[192:195], v196, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v196, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v196, v10, v196, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[200:203], v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v198, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[204:207], v198, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v198, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v198, v10, v198, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[208:211], v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v216, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[212:215], v216, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v216, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v216, v10, v216, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_dwordx4 v[220:223], v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+16], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+17], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+18], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+19], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+20], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+21], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+22], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+23], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 v[vgprValuC+24], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+25], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+26], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+27], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+28], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+29], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+30], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+31], acc62 // copy acc to vreg[143] 
+v_accvgpr_read_b32 v[vgprValuC+32], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+33], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+34], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+35], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+36], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+37], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+38], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+39], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+40], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+41], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+42], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+43], acc110 // copy acc to vreg[155] +v_accvgpr_read_b32 v[vgprValuC+44], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+45], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+46], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+47], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+48], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+49], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+50], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+51], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+52], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+53], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+54], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+55], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+56], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+57], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+58], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+59], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+60], acc178 // copy acc to vreg[172] 
+v_accvgpr_read_b32 v[vgprValuC+61], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+62], acc186 // copy acc to vreg[174] +v_accvgpr_read_b32 v[vgprValuC+63], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+64], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+65], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+66], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+67], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+68], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+69], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+70], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+71], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+72], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+73], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+74], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+75], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+76], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+77], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+78], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+79], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+80], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+81], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+82], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+83], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+84], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+85], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+86], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+87], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+88], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+89], acc39 // copy acc to vreg[201] 
+v_accvgpr_read_b32 v[vgprValuC+90], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+91], acc47 // copy acc to vreg[203] +v_accvgpr_read_b32 v[vgprValuC+92], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+93], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+94], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+95], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+96], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+97], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+98], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+99], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+100], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+101], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+102], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+103], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+104], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+105], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+106], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+107], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+108], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+109], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+110], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+111], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+112], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+113], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+114], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+115], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+116], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+117], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+118], acc155 // copy acc to 
vreg[230] +v_accvgpr_read_b32 v[vgprValuC+119], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+120], acc163 // copy acc to vreg[232] +v_accvgpr_read_b32 v[vgprValuC+121], acc167 // copy acc to vreg[233] +v_accvgpr_read_b32 v[vgprValuC+122], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+123], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+124], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+125], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+126], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+127], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+136], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+137], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+138], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+139], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+140], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+141], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+142], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+143], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+144], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32 v[vgprValuC+145], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+146], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+147], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+148], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+149], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+150], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+151], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 
0), (0, 0, 30, 0), (0, 0, 31, 0)] */ +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha 
(pk) +v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk) +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v12, v[vgprValuC+16] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v12, v[vgprValuC+17] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v13, v[vgprValuC+18] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v13, v[vgprValuC+19] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v14, v[vgprValuC+20] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v14, v[vgprValuC+21] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v15, v[vgprValuC+22] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v15, v[vgprValuC+23] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 
v[vgprValuC+16], v[vgprValuC+16] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+17], v[vgprValuC+17] // convert C to fp16 +v_pack_b32_f16 v16, v[vgprValuC+16], v[vgprValuC+17] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+18], v[vgprValuC+18] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+19], v[vgprValuC+19] // convert C to fp16 +v_pack_b32_f16 v17, v[vgprValuC+18], v[vgprValuC+19] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+20], v[vgprValuC+20] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+21], v[vgprValuC+21] // convert C to fp16 +v_pack_b32_f16 v18, v[vgprValuC+20], v[vgprValuC+21] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+22], v[vgprValuC+22] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+23], v[vgprValuC+23] // convert C to fp16 +v_pack_b32_f16 v19, v[vgprValuC+22], v[vgprValuC+23] // Pack with neighbor +buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v128, v[vgprValuC+24] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v128, v[vgprValuC+25] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v129, v[vgprValuC+26] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v129, v[vgprValuC+27] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v130, v[vgprValuC+28] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v130, v[vgprValuC+29] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v131, v[vgprValuC+30] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v131, v[vgprValuC+31] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+24], v[vgprValuC+24] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+25], v[vgprValuC+25] // convert C to fp16 +v_pack_b32_f16 v24, 
v[vgprValuC+24], v[vgprValuC+25] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+26], v[vgprValuC+26] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+27], v[vgprValuC+27] // convert C to fp16 +v_pack_b32_f16 v25, v[vgprValuC+26], v[vgprValuC+27] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+28], v[vgprValuC+28] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+29], v[vgprValuC+29] // convert C to fp16 +v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 +v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor +buffer_store_dwordx4 v[24:27], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v152, v[vgprValuC+32] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v152, v[vgprValuC+33] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v153, v[vgprValuC+34] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v153, v[vgprValuC+35] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v154, v[vgprValuC+36] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v154, v[vgprValuC+37] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v155, v[vgprValuC+38] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v155, v[vgprValuC+39] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+32], v[vgprValuC+32] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+33], v[vgprValuC+33] // convert C to fp16 +v_pack_b32_f16 v32, v[vgprValuC+32], v[vgprValuC+33] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+34], v[vgprValuC+34] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+35], v[vgprValuC+35] // convert C to fp16 +v_pack_b32_f16 v33, v[vgprValuC+34], v[vgprValuC+35] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+36], v[vgprValuC+36] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+37], v[vgprValuC+37] // convert C to fp16 +v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 +v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor +buffer_store_dwordx4 v[32:35], v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v160, v[vgprValuC+40] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v160, v[vgprValuC+41] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v161, v[vgprValuC+42] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v161, v[vgprValuC+43] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v162, v[vgprValuC+44] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v162, v[vgprValuC+45] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v163, v[vgprValuC+46] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v163, v[vgprValuC+47] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+40], v[vgprValuC+40] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+41], v[vgprValuC+41] // convert C to fp16 +v_pack_b32_f16 v40, v[vgprValuC+40], v[vgprValuC+41] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+42], v[vgprValuC+42] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+43], v[vgprValuC+43] // convert C to fp16 +v_pack_b32_f16 v41, v[vgprValuC+42], v[vgprValuC+43] // Pack with neighbor +v_cvt_f16_f32 
v[vgprValuC+44], v[vgprValuC+44] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+45], v[vgprValuC+45] // convert C to fp16 +v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 +v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor +buffer_store_dwordx4 v[40:43], v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v164, v[vgprValuC+48] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v164, v[vgprValuC+49] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v165, v[vgprValuC+50] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v165, v[vgprValuC+51] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v166, v[vgprValuC+52] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v166, v[vgprValuC+53] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v167, v[vgprValuC+54] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v167, v[vgprValuC+55] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+48], v[vgprValuC+48] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+49], v[vgprValuC+49] // convert C to fp16 +v_pack_b32_f16 v48, v[vgprValuC+48], v[vgprValuC+49] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+50], v[vgprValuC+50] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+51], v[vgprValuC+51] // convert C to fp16 +v_pack_b32_f16 v49, v[vgprValuC+50], v[vgprValuC+51] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+52], v[vgprValuC+52] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+53], v[vgprValuC+53] // convert C to fp16 +v_pack_b32_f16 v50, 
v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 +v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor +buffer_store_dwordx4 v[48:51], v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v168, v[vgprValuC+56] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v168, v[vgprValuC+57] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v169, v[vgprValuC+58] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v169, v[vgprValuC+59] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v170, v[vgprValuC+60] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v170, v[vgprValuC+61] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v171, v[vgprValuC+62] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v171, v[vgprValuC+63] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+56], v[vgprValuC+56] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+57], v[vgprValuC+57] // convert C to fp16 +v_pack_b32_f16 v56, v[vgprValuC+56], v[vgprValuC+57] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+58], v[vgprValuC+58] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+59], v[vgprValuC+59] // convert C to fp16 +v_pack_b32_f16 v57, v[vgprValuC+58], v[vgprValuC+59] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+60], v[vgprValuC+60] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+61], v[vgprValuC+61] // convert C to fp16 +v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 +v_cvt_f16_f32 
v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 +v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor +buffer_store_dwordx4 v[56:59], v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v172, v[vgprValuC+64] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v172, v[vgprValuC+65] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v173, v[vgprValuC+66] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v173, v[vgprValuC+67] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v174, v[vgprValuC+68] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v174, v[vgprValuC+69] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v175, v[vgprValuC+70] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v175, v[vgprValuC+71] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+64], v[vgprValuC+64] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+65], v[vgprValuC+65] // convert C to fp16 +v_pack_b32_f16 v64, v[vgprValuC+64], v[vgprValuC+65] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+66], v[vgprValuC+66] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+67], v[vgprValuC+67] // convert C to fp16 +v_pack_b32_f16 v65, v[vgprValuC+66], v[vgprValuC+67] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+68], v[vgprValuC+68] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+69], v[vgprValuC+69] // convert C to fp16 +v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 +v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor 
+buffer_store_dwordx4 v[64:67], v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v180, v[vgprValuC+72] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v180, v[vgprValuC+73] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v181, v[vgprValuC+74] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v181, v[vgprValuC+75] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v182, v[vgprValuC+76] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v182, v[vgprValuC+77] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+78], s[sgprBeta], v183, v[vgprValuC+78] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v183, v[vgprValuC+79] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+72], v[vgprValuC+72] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+73], v[vgprValuC+73] // convert C to fp16 +v_pack_b32_f16 v72, v[vgprValuC+72], v[vgprValuC+73] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+74], v[vgprValuC+74] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+75], v[vgprValuC+75] // convert C to fp16 +v_pack_b32_f16 v73, v[vgprValuC+74], v[vgprValuC+75] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+76], v[vgprValuC+76] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+77], v[vgprValuC+77] // convert C to fp16 +v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 +v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor +buffer_store_dwordx4 v[72:75], v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v184, 
v[vgprValuC+80] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v184, v[vgprValuC+81] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v185, v[vgprValuC+82] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v185, v[vgprValuC+83] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v186, v[vgprValuC+84] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v186, v[vgprValuC+85] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v187, v[vgprValuC+86] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+87], s[sgprBeta], v187, v[vgprValuC+87] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+80], v[vgprValuC+80] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+81], v[vgprValuC+81] // convert C to fp16 +v_pack_b32_f16 v80, v[vgprValuC+80], v[vgprValuC+81] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+82], v[vgprValuC+82] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+83], v[vgprValuC+83] // convert C to fp16 +v_pack_b32_f16 v81, v[vgprValuC+82], v[vgprValuC+83] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+84], v[vgprValuC+84] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+85], v[vgprValuC+85] // convert C to fp16 +v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 +v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor +buffer_store_dwordx4 v[80:83], v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v188, v[vgprValuC+88] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+89], s[sgprBeta], v188, v[vgprValuC+89] op_sel:[0,1,0] 
op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+90], s[sgprBeta], v189, v[vgprValuC+90] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+91], s[sgprBeta], v189, v[vgprValuC+91] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+92], s[sgprBeta], v190, v[vgprValuC+92] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+93], s[sgprBeta], v190, v[vgprValuC+93] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+94], s[sgprBeta], v191, v[vgprValuC+94] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+95], s[sgprBeta], v191, v[vgprValuC+95] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+88], v[vgprValuC+88] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+89], v[vgprValuC+89] // convert C to fp16 +v_pack_b32_f16 v88, v[vgprValuC+88], v[vgprValuC+89] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+90], v[vgprValuC+90] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+91], v[vgprValuC+91] // convert C to fp16 +v_pack_b32_f16 v89, v[vgprValuC+90], v[vgprValuC+91] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+92], v[vgprValuC+92] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+93], v[vgprValuC+93] // convert C to fp16 +v_pack_b32_f16 v90, v[vgprValuC+92], v[vgprValuC+93] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+94], v[vgprValuC+94] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+95], v[vgprValuC+95] // convert C to fp16 +v_pack_b32_f16 v91, v[vgprValuC+94], v[vgprValuC+95] // Pack with neighbor +buffer_store_dwordx4 v[88:91], v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+96], s[sgprBeta], v192, v[vgprValuC+96] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+97], s[sgprBeta], v192, v[vgprValuC+97] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+98], s[sgprBeta], v193, v[vgprValuC+98] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_fma_mix_f32 v[vgprValuC+99], s[sgprBeta], v193, v[vgprValuC+99] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+100], s[sgprBeta], v194, v[vgprValuC+100] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+101], s[sgprBeta], v194, v[vgprValuC+101] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+102], s[sgprBeta], v195, v[vgprValuC+102] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+103], s[sgprBeta], v195, v[vgprValuC+103] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+96], v[vgprValuC+96] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+97], v[vgprValuC+97] // convert C to fp16 +v_pack_b32_f16 v96, v[vgprValuC+96], v[vgprValuC+97] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+98], v[vgprValuC+98] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+99], v[vgprValuC+99] // convert C to fp16 +v_pack_b32_f16 v97, v[vgprValuC+98], v[vgprValuC+99] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+100], v[vgprValuC+100] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+101], v[vgprValuC+101] // convert C to fp16 +v_pack_b32_f16 v98, v[vgprValuC+100], v[vgprValuC+101] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+102], v[vgprValuC+102] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+103], v[vgprValuC+103] // convert C to fp16 +v_pack_b32_f16 v99, v[vgprValuC+102], v[vgprValuC+103] // Pack with neighbor +buffer_store_dwordx4 v[96:99], v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+104], s[sgprBeta], v200, v[vgprValuC+104] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+105], s[sgprBeta], v200, v[vgprValuC+105] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+106], s[sgprBeta], v201, v[vgprValuC+106] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+107], s[sgprBeta], v201, v[vgprValuC+107] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_fma_mix_f32 v[vgprValuC+108], s[sgprBeta], v202, v[vgprValuC+108] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+109], s[sgprBeta], v202, v[vgprValuC+109] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+110], s[sgprBeta], v203, v[vgprValuC+110] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+111], s[sgprBeta], v203, v[vgprValuC+111] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+104], v[vgprValuC+104] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+105], v[vgprValuC+105] // convert C to fp16 +v_pack_b32_f16 v104, v[vgprValuC+104], v[vgprValuC+105] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+106], v[vgprValuC+106] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+107], v[vgprValuC+107] // convert C to fp16 +v_pack_b32_f16 v105, v[vgprValuC+106], v[vgprValuC+107] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+108], v[vgprValuC+108] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+109], v[vgprValuC+109] // convert C to fp16 +v_pack_b32_f16 v106, v[vgprValuC+108], v[vgprValuC+109] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+110], v[vgprValuC+110] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+111], v[vgprValuC+111] // convert C to fp16 +v_pack_b32_f16 v107, v[vgprValuC+110], v[vgprValuC+111] // Pack with neighbor +buffer_store_dwordx4 v[104:107], v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+112], s[sgprBeta], v204, v[vgprValuC+112] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+113], s[sgprBeta], v204, v[vgprValuC+113] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+114], s[sgprBeta], v205, v[vgprValuC+114] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+115], s[sgprBeta], v205, v[vgprValuC+115] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+116], s[sgprBeta], v206, v[vgprValuC+116] op_sel:[0,0,0] 
op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+117], s[sgprBeta], v206, v[vgprValuC+117] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+118], s[sgprBeta], v207, v[vgprValuC+118] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+119], s[sgprBeta], v207, v[vgprValuC+119] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+112], v[vgprValuC+112] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+113], v[vgprValuC+113] // convert C to fp16 +v_pack_b32_f16 v112, v[vgprValuC+112], v[vgprValuC+113] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+114], v[vgprValuC+114] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+115], v[vgprValuC+115] // convert C to fp16 +v_pack_b32_f16 v113, v[vgprValuC+114], v[vgprValuC+115] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+116], v[vgprValuC+116] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+117], v[vgprValuC+117] // convert C to fp16 +v_pack_b32_f16 v114, v[vgprValuC+116], v[vgprValuC+117] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+118], v[vgprValuC+118] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+119], v[vgprValuC+119] // convert C to fp16 +v_pack_b32_f16 v115, v[vgprValuC+118], v[vgprValuC+119] // Pack with neighbor +buffer_store_dwordx4 v[112:115], v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+120], s[sgprBeta], v208, v[vgprValuC+120] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+121], s[sgprBeta], v208, v[vgprValuC+121] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+122], s[sgprBeta], v209, v[vgprValuC+122] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+123], s[sgprBeta], v209, v[vgprValuC+123] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+124], s[sgprBeta], v210, v[vgprValuC+124] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+125], s[sgprBeta], v210, v[vgprValuC+125] 
op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+126], s[sgprBeta], v211, v[vgprValuC+126] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+127], s[sgprBeta], v211, v[vgprValuC+127] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+120], v[vgprValuC+120] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+121], v[vgprValuC+121] // convert C to fp16 +v_pack_b32_f16 v120, v[vgprValuC+120], v[vgprValuC+121] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+122], v[vgprValuC+122] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+123], v[vgprValuC+123] // convert C to fp16 +v_pack_b32_f16 v121, v[vgprValuC+122], v[vgprValuC+123] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+124], v[vgprValuC+124] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+125], v[vgprValuC+125] // convert C to fp16 +v_pack_b32_f16 v122, v[vgprValuC+124], v[vgprValuC+125] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+126], v[vgprValuC+126] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+127], v[vgprValuC+127] // convert C to fp16 +v_pack_b32_f16 v123, v[vgprValuC+126], v[vgprValuC+127] // Pack with neighbor +buffer_store_dwordx4 v[120:123], v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+136], s[sgprBeta], v212, v[vgprValuC+136] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+137], s[sgprBeta], v212, v[vgprValuC+137] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+138], s[sgprBeta], v213, v[vgprValuC+138] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+139], s[sgprBeta], v213, v[vgprValuC+139] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+140], s[sgprBeta], v214, v[vgprValuC+140] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+141], s[sgprBeta], v214, v[vgprValuC+141] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+142], s[sgprBeta], v215, 
v[vgprValuC+142] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+143], s[sgprBeta], v215, v[vgprValuC+143] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+136], v[vgprValuC+136] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+137], v[vgprValuC+137] // convert C to fp16 +v_pack_b32_f16 v136, v[vgprValuC+136], v[vgprValuC+137] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+138], v[vgprValuC+138] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+139], v[vgprValuC+139] // convert C to fp16 +v_pack_b32_f16 v137, v[vgprValuC+138], v[vgprValuC+139] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+140], v[vgprValuC+140] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+141], v[vgprValuC+141] // convert C to fp16 +v_pack_b32_f16 v138, v[vgprValuC+140], v[vgprValuC+141] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+142], v[vgprValuC+142] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+143], v[vgprValuC+143] // convert C to fp16 +v_pack_b32_f16 v139, v[vgprValuC+142], v[vgprValuC+143] // Pack with neighbor +buffer_store_dwordx4 v[136:139], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+144], s[sgprBeta], v220, v[vgprValuC+144] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+145], s[sgprBeta], v220, v[vgprValuC+145] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+146], s[sgprBeta], v221, v[vgprValuC+146] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+147], s[sgprBeta], v221, v[vgprValuC+147] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+148], s[sgprBeta], v222, v[vgprValuC+148] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+149], s[sgprBeta], v222, v[vgprValuC+149] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+150], s[sgprBeta], v223, v[vgprValuC+150] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_fma_mix_f32 v[vgprValuC+151], 
s[sgprBeta], v223, v[vgprValuC+151] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v[vgprValuC+144], v[vgprValuC+144] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+145], v[vgprValuC+145] // convert C to fp16 +v_pack_b32_f16 v144, v[vgprValuC+144], v[vgprValuC+145] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+146], v[vgprValuC+146] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+147], v[vgprValuC+147] // convert C to fp16 +v_pack_b32_f16 v145, v[vgprValuC+146], v[vgprValuC+147] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+148], v[vgprValuC+148] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+149], v[vgprValuC+149] // convert C to fp16 +v_pack_b32_f16 v146, v[vgprValuC+148], v[vgprValuC+149] // Pack with neighbor +v_cvt_f16_f32 v[vgprValuC+150], v[vgprValuC+150] // convert C to fp16 +v_cvt_f16_f32 v[vgprValuC+151], v[vgprValuC+151] // convert C to fp16 +v_pack_b32_f16 v147, v[vgprValuC+150], v[vgprValuC+151] // Pack with neighbor +buffer_store_dwordx4 v[144:147], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +s_branch label_GW_End_2 // jump to end +label_GW_B1_E1_M: + +/* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=78 */ +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ +/* (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,0,0,0) */ +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v90, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v89, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v90, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,0,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,0,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,1,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,1,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,2,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,2,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,3,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,3,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,4,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,4,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,5,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,5,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,6,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,6,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,7,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,7,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,8,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,8,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,9,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v247, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v247, v10, v247, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v246, v247, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v247, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v247, v10, v247, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+11], acc0 // copy acc to vreg[0] +v_accvgpr_read_b32 v[vgprValuC+12], acc4 // copy acc to vreg[1] +v_accvgpr_read_b32 v[vgprValuC+13], acc8 // copy acc to vreg[2] +v_accvgpr_read_b32 v[vgprValuC+14], acc12 // copy acc to vreg[3] +v_accvgpr_read_b32 v[vgprValuC+15], acc16 // copy acc to vreg[4] +v_accvgpr_read_b32 v[vgprValuC+16], acc20 // copy acc to vreg[5] +v_accvgpr_read_b32 v[vgprValuC+17], acc24 // copy acc to vreg[6] +v_accvgpr_read_b32 v[vgprValuC+18], acc28 // copy acc to vreg[7] +v_accvgpr_read_b32 v[vgprValuC+19], acc32 // copy acc to vreg[8] +v_accvgpr_read_b32 v[vgprValuC+20], acc36 // copy acc to vreg[9] +v_accvgpr_read_b32 v[vgprValuC+21], acc40 // copy acc to vreg[10] +v_accvgpr_read_b32 v[vgprValuC+22], acc44 // copy acc to vreg[11] +v_accvgpr_read_b32 v[vgprValuC+23], acc48 // copy acc to vreg[12] +v_accvgpr_read_b32 v[vgprValuC+24], acc52 // copy acc to vreg[13] +v_accvgpr_read_b32 v[vgprValuC+25], acc56 // copy acc to vreg[14] +v_accvgpr_read_b32 v[vgprValuC+26], acc60 // copy acc to vreg[15] 
+v_accvgpr_read_b32 v[vgprValuC+27], acc64 // copy acc to vreg[16] +v_accvgpr_read_b32 v[vgprValuC+28], acc68 // copy acc to vreg[17] +v_accvgpr_read_b32 v[vgprValuC+29], acc72 // copy acc to vreg[18] +v_accvgpr_read_b32 v[vgprValuC+30], acc76 // copy acc to vreg[19] +v_accvgpr_read_b32 v[vgprValuC+31], acc80 // copy acc to vreg[20] +v_accvgpr_read_b32 v[vgprValuC+32], acc84 // copy acc to vreg[21] +v_accvgpr_read_b32 v[vgprValuC+33], acc88 // copy acc to vreg[22] +v_accvgpr_read_b32 v[vgprValuC+34], acc92 // copy acc to vreg[23] +v_accvgpr_read_b32 v[vgprValuC+35], acc96 // copy acc to vreg[24] +v_accvgpr_read_b32 v[vgprValuC+36], acc100 // copy acc to vreg[25] +v_accvgpr_read_b32 v[vgprValuC+37], acc104 // copy acc to vreg[26] +v_accvgpr_read_b32 v[vgprValuC+38], acc108 // copy acc to vreg[27] +v_accvgpr_read_b32 v[vgprValuC+39], acc112 // copy acc to vreg[28] +v_accvgpr_read_b32 v[vgprValuC+40], acc116 // copy acc to vreg[29] +v_accvgpr_read_b32 v[vgprValuC+41], acc120 // copy acc to vreg[30] +v_accvgpr_read_b32 v[vgprValuC+42], acc124 // copy acc to vreg[31] +v_accvgpr_read_b32 v[vgprValuC+43], acc128 // copy acc to vreg[32] +v_accvgpr_read_b32 v[vgprValuC+44], acc132 // copy acc to vreg[33] +v_accvgpr_read_b32 v[vgprValuC+45], acc136 // copy acc to vreg[34] +v_accvgpr_read_b32 v[vgprValuC+46], acc140 // copy acc to vreg[35] +v_accvgpr_read_b32 v[vgprValuC+47], acc144 // copy acc to vreg[36] +v_accvgpr_read_b32 v[vgprValuC+48], acc148 // copy acc to vreg[37] +v_accvgpr_read_b32 v[vgprValuC+49], acc152 // copy acc to vreg[38] +v_accvgpr_read_b32 v[vgprValuC+50], acc156 // copy acc to vreg[39] +v_accvgpr_read_b32 v[vgprValuC+51], acc160 // copy acc to vreg[40] +v_accvgpr_read_b32 v[vgprValuC+52], acc164 // copy acc to vreg[41] +v_accvgpr_read_b32 v[vgprValuC+53], acc168 // copy acc to vreg[42] +v_accvgpr_read_b32 v[vgprValuC+54], acc172 // copy acc to vreg[43] +v_accvgpr_read_b32 v[vgprValuC+55], acc176 // copy acc to vreg[44] +v_accvgpr_read_b32 v[vgprValuC+56], 
acc180 // copy acc to vreg[45] +v_accvgpr_read_b32 v[vgprValuC+57], acc184 // copy acc to vreg[46] +v_accvgpr_read_b32 v[vgprValuC+58], acc188 // copy acc to vreg[47] +v_accvgpr_read_b32 v[vgprValuC+59], acc192 // copy acc to vreg[48] +v_accvgpr_read_b32 v[vgprValuC+60], acc196 // copy acc to vreg[49] +v_accvgpr_read_b32 v[vgprValuC+61], acc200 // copy acc to vreg[50] +v_accvgpr_read_b32 v[vgprValuC+62], acc204 // copy acc to vreg[51] +v_accvgpr_read_b32 v[vgprValuC+63], acc208 // copy acc to vreg[52] +v_accvgpr_read_b32 v[vgprValuC+64], acc212 // copy acc to vreg[53] +v_accvgpr_read_b32 v[vgprValuC+65], acc216 // copy acc to vreg[54] +v_accvgpr_read_b32 v[vgprValuC+66], acc220 // copy acc to vreg[55] +v_accvgpr_read_b32 v[vgprValuC+67], acc224 // copy acc to vreg[56] +v_accvgpr_read_b32 v[vgprValuC+68], acc228 // copy acc to vreg[57] +v_accvgpr_read_b32 v[vgprValuC+69], acc232 // copy acc to vreg[58] +v_accvgpr_read_b32 v[vgprValuC+70], acc236 // copy acc to vreg[59] +v_accvgpr_read_b32 v[vgprValuC+71], acc240 // copy acc to vreg[60] +v_accvgpr_read_b32 v[vgprValuC+72], acc244 // copy acc to vreg[61] +v_accvgpr_read_b32 v[vgprValuC+73], acc248 // copy acc to vreg[62] +v_accvgpr_read_b32 v[vgprValuC+74], acc252 // copy acc to vreg[63] +v_accvgpr_read_b32 v[vgprValuC+75], acc1 // copy acc to vreg[64] +v_accvgpr_read_b32 v[vgprValuC+76], acc5 // copy acc to vreg[65] +v_accvgpr_read_b32 v[vgprValuC+77], acc9 // copy acc to vreg[66] +v_accvgpr_read_b32 v[vgprValuC+78], acc13 // copy acc to vreg[67] +v_accvgpr_read_b32 v[vgprValuC+79], acc17 // copy acc to vreg[68] +v_accvgpr_read_b32 v[vgprValuC+80], acc21 // copy acc to vreg[69] +v_accvgpr_read_b32 v[vgprValuC+81], acc25 // copy acc to vreg[70] +v_accvgpr_read_b32 v[vgprValuC+82], acc29 // copy acc to vreg[71] +v_accvgpr_read_b32 v[vgprValuC+83], acc33 // copy acc to vreg[72] +v_accvgpr_read_b32 v[vgprValuC+84], acc37 // copy acc to vreg[73] +v_accvgpr_read_b32 v[vgprValuC+85], acc41 // copy acc to vreg[74] 
+v_accvgpr_read_b32 v[vgprValuC+86], acc45 // copy acc to vreg[75] +v_accvgpr_read_b32 v[vgprValuC+87], acc49 // copy acc to vreg[76] +v_accvgpr_read_b32 v[vgprValuC+88], acc53 // copy acc to vreg[77] + +/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+88], s[sgprAlpha], v[vgprValuC+88] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_fma_mix_f32 v[vgprValuC+11], s[sgprBeta], v89, v[vgprValuC+11] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert C to fp16 +buffer_store_short v11, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+12], s[sgprBeta], v91, v[vgprValuC+12] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+13], s[sgprBeta], v93, v[vgprValuC+13] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+14], s[sgprBeta], v95, v[vgprValuC+14] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+15], s[sgprBeta], v97, v[vgprValuC+15] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v15, v[vgprValuC+15] // 
convert C to fp16 +buffer_store_short v15, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v99, v[vgprValuC+16] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v101, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v103, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 +buffer_store_short v18, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v105, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v107, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v109, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v111, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v113, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 +buffer_store_short v23, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v115, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v117, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 +buffer_store_short v25, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v119, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v121, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v123, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v125, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v127, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v129, 
v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v131, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v136, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 +buffer_store_short v33, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v138, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 +buffer_store_short v34, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v140, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 +buffer_store_short v35, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v142, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 +buffer_store_short v36, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v144, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 +buffer_store_short v37, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v146, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 +buffer_store_short v38, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v148, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 +buffer_store_short v39, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v150, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 +buffer_store_short v40, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v152, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 +buffer_store_short v41, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v154, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 +buffer_store_short v42, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v156, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 +buffer_store_short v43, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v158, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 +buffer_store_short v44, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v160, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 +buffer_store_short v45, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v162, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 +buffer_store_short v46, v163, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 +buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v166, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 +buffer_store_short v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v168, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 +buffer_store_short v49, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v170, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 +buffer_store_short v50, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v172, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 +buffer_store_short v51, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v174, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 +buffer_store_short v52, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v176, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 +buffer_store_short v53, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v178, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v54, v[vgprValuC+54] // 
convert C to fp16 +buffer_store_short v54, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v180, v[vgprValuC+55] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 +buffer_store_short v55, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v182, v[vgprValuC+56] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 +buffer_store_short v56, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v184, v[vgprValuC+57] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 +buffer_store_short v57, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v186, v[vgprValuC+58] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 +buffer_store_short v58, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v188, v[vgprValuC+59] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 +buffer_store_short v59, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v190, v[vgprValuC+60] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 +buffer_store_short v60, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v192, v[vgprValuC+61] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v61, v[vgprValuC+61] // convert C to fp16 +buffer_store_short v61, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v194, v[vgprValuC+62] op_sel:[0,1,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_cvt_f16_f32 v62, v[vgprValuC+62] // convert C to fp16 +buffer_store_short v62, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v196, v[vgprValuC+63] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v63, v[vgprValuC+63] // convert C to fp16 +buffer_store_short v63, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v198, v[vgprValuC+64] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v64, v[vgprValuC+64] // convert C to fp16 +buffer_store_short v64, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v200, v[vgprValuC+65] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v65, v[vgprValuC+65] // convert C to fp16 +buffer_store_short v65, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v202, v[vgprValuC+66] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v66, v[vgprValuC+66] // convert C to fp16 +buffer_store_short v66, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v204, v[vgprValuC+67] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v67, v[vgprValuC+67] // convert C to fp16 +buffer_store_short v67, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v206, v[vgprValuC+68] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v68, v[vgprValuC+68] // convert C to fp16 +buffer_store_short v68, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v208, v[vgprValuC+69] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v69, v[vgprValuC+69] // convert C to fp16 +buffer_store_short v69, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v210, 
v[vgprValuC+70] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v70, v[vgprValuC+70] // convert C to fp16 +buffer_store_short v70, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v212, v[vgprValuC+71] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v71, v[vgprValuC+71] // convert C to fp16 +buffer_store_short v71, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v214, v[vgprValuC+72] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v72, v[vgprValuC+72] // convert C to fp16 +buffer_store_short v72, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v216, v[vgprValuC+73] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v73, v[vgprValuC+73] // convert C to fp16 +buffer_store_short v73, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v218, v[vgprValuC+74] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v74, v[vgprValuC+74] // convert C to fp16 +buffer_store_short v74, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v220, v[vgprValuC+75] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v75, v[vgprValuC+75] // convert C to fp16 +buffer_store_short v75, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v222, v[vgprValuC+76] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v76, v[vgprValuC+76] // convert C to fp16 +buffer_store_short v76, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v224, v[vgprValuC+77] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v77, v[vgprValuC+77] // convert C to fp16 +buffer_store_short v77, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_fma_mix_f32 v[vgprValuC+78], s[sgprBeta], v226, v[vgprValuC+78] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v78, v[vgprValuC+78] // convert C to fp16 +buffer_store_short v78, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v228, v[vgprValuC+79] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v79, v[vgprValuC+79] // convert C to fp16 +buffer_store_short v79, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v230, v[vgprValuC+80] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v80, v[vgprValuC+80] // convert C to fp16 +buffer_store_short v80, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v232, v[vgprValuC+81] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v81, v[vgprValuC+81] // convert C to fp16 +buffer_store_short v81, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v234, v[vgprValuC+82] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v82, v[vgprValuC+82] // convert C to fp16 +buffer_store_short v82, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v236, v[vgprValuC+83] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v83, v[vgprValuC+83] // convert C to fp16 +buffer_store_short v83, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v238, v[vgprValuC+84] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v84, v[vgprValuC+84] // convert C to fp16 +buffer_store_short v84, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v240, v[vgprValuC+85] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v85, v[vgprValuC+85] // convert C to fp16 +buffer_store_short v85, v241, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v242, v[vgprValuC+86] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v86, v[vgprValuC+86] // convert C to fp16 +buffer_store_short v86, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+87], s[sgprBeta], v244, v[vgprValuC+87] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v87, v[vgprValuC+87] // convert C to fp16 +buffer_store_short v87, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v246, v[vgprValuC+88] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v88, v[vgprValuC+88] // convert C to fp16 +buffer_store_short v88, v247, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ +/* (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); 
(0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,9,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v89, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,9,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,10,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,10,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,11,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,11,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,12,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,12,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,13,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,13,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,14,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,14,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,15,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,15,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,16,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,16,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,17,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,17,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,18,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,18,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v247, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v247, v10, v247, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v246, v247, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v247, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v247, v10, v247, s[34:35] // LDD clip if OOB. 
offset +v_accvgpr_read_b32 v[vgprValuC+11], acc57 // copy acc to vreg[78] +v_accvgpr_read_b32 v[vgprValuC+12], acc61 // copy acc to vreg[79] +v_accvgpr_read_b32 v[vgprValuC+13], acc65 // copy acc to vreg[80] +v_accvgpr_read_b32 v[vgprValuC+14], acc69 // copy acc to vreg[81] +v_accvgpr_read_b32 v[vgprValuC+15], acc73 // copy acc to vreg[82] +v_accvgpr_read_b32 v[vgprValuC+16], acc77 // copy acc to vreg[83] +v_accvgpr_read_b32 v[vgprValuC+17], acc81 // copy acc to vreg[84] +v_accvgpr_read_b32 v[vgprValuC+18], acc85 // copy acc to vreg[85] +v_accvgpr_read_b32 v[vgprValuC+19], acc89 // copy acc to vreg[86] +v_accvgpr_read_b32 v[vgprValuC+20], acc93 // copy acc to vreg[87] +v_accvgpr_read_b32 v[vgprValuC+21], acc97 // copy acc to vreg[88] +v_accvgpr_read_b32 v[vgprValuC+22], acc101 // copy acc to vreg[89] +v_accvgpr_read_b32 v[vgprValuC+23], acc105 // copy acc to vreg[90] +v_accvgpr_read_b32 v[vgprValuC+24], acc109 // copy acc to vreg[91] +v_accvgpr_read_b32 v[vgprValuC+25], acc113 // copy acc to vreg[92] +v_accvgpr_read_b32 v[vgprValuC+26], acc117 // copy acc to vreg[93] +v_accvgpr_read_b32 v[vgprValuC+27], acc121 // copy acc to vreg[94] +v_accvgpr_read_b32 v[vgprValuC+28], acc125 // copy acc to vreg[95] +v_accvgpr_read_b32 v[vgprValuC+29], acc129 // copy acc to vreg[96] +v_accvgpr_read_b32 v[vgprValuC+30], acc133 // copy acc to vreg[97] +v_accvgpr_read_b32 v[vgprValuC+31], acc137 // copy acc to vreg[98] +v_accvgpr_read_b32 v[vgprValuC+32], acc141 // copy acc to vreg[99] +v_accvgpr_read_b32 v[vgprValuC+33], acc145 // copy acc to vreg[100] +v_accvgpr_read_b32 v[vgprValuC+34], acc149 // copy acc to vreg[101] +v_accvgpr_read_b32 v[vgprValuC+35], acc153 // copy acc to vreg[102] +v_accvgpr_read_b32 v[vgprValuC+36], acc157 // copy acc to vreg[103] +v_accvgpr_read_b32 v[vgprValuC+37], acc161 // copy acc to vreg[104] +v_accvgpr_read_b32 v[vgprValuC+38], acc165 // copy acc to vreg[105] +v_accvgpr_read_b32 v[vgprValuC+39], acc169 // copy acc to vreg[106] +v_accvgpr_read_b32 
v[vgprValuC+40], acc173 // copy acc to vreg[107] +v_accvgpr_read_b32 v[vgprValuC+41], acc177 // copy acc to vreg[108] +v_accvgpr_read_b32 v[vgprValuC+42], acc181 // copy acc to vreg[109] +v_accvgpr_read_b32 v[vgprValuC+43], acc185 // copy acc to vreg[110] +v_accvgpr_read_b32 v[vgprValuC+44], acc189 // copy acc to vreg[111] +v_accvgpr_read_b32 v[vgprValuC+45], acc193 // copy acc to vreg[112] +v_accvgpr_read_b32 v[vgprValuC+46], acc197 // copy acc to vreg[113] +v_accvgpr_read_b32 v[vgprValuC+47], acc201 // copy acc to vreg[114] +v_accvgpr_read_b32 v[vgprValuC+48], acc205 // copy acc to vreg[115] +v_accvgpr_read_b32 v[vgprValuC+49], acc209 // copy acc to vreg[116] +v_accvgpr_read_b32 v[vgprValuC+50], acc213 // copy acc to vreg[117] +v_accvgpr_read_b32 v[vgprValuC+51], acc217 // copy acc to vreg[118] +v_accvgpr_read_b32 v[vgprValuC+52], acc221 // copy acc to vreg[119] +v_accvgpr_read_b32 v[vgprValuC+53], acc225 // copy acc to vreg[120] +v_accvgpr_read_b32 v[vgprValuC+54], acc229 // copy acc to vreg[121] +v_accvgpr_read_b32 v[vgprValuC+55], acc233 // copy acc to vreg[122] +v_accvgpr_read_b32 v[vgprValuC+56], acc237 // copy acc to vreg[123] +v_accvgpr_read_b32 v[vgprValuC+57], acc241 // copy acc to vreg[124] +v_accvgpr_read_b32 v[vgprValuC+58], acc245 // copy acc to vreg[125] +v_accvgpr_read_b32 v[vgprValuC+59], acc249 // copy acc to vreg[126] +v_accvgpr_read_b32 v[vgprValuC+60], acc253 // copy acc to vreg[127] +v_accvgpr_read_b32 v[vgprValuC+61], acc2 // copy acc to vreg[128] +v_accvgpr_read_b32 v[vgprValuC+62], acc6 // copy acc to vreg[129] +v_accvgpr_read_b32 v[vgprValuC+63], acc10 // copy acc to vreg[130] +v_accvgpr_read_b32 v[vgprValuC+64], acc14 // copy acc to vreg[131] +v_accvgpr_read_b32 v[vgprValuC+65], acc18 // copy acc to vreg[132] +v_accvgpr_read_b32 v[vgprValuC+66], acc22 // copy acc to vreg[133] +v_accvgpr_read_b32 v[vgprValuC+67], acc26 // copy acc to vreg[134] +v_accvgpr_read_b32 v[vgprValuC+68], acc30 // copy acc to vreg[135] +v_accvgpr_read_b32 
v[vgprValuC+69], acc34 // copy acc to vreg[136] +v_accvgpr_read_b32 v[vgprValuC+70], acc38 // copy acc to vreg[137] +v_accvgpr_read_b32 v[vgprValuC+71], acc42 // copy acc to vreg[138] +v_accvgpr_read_b32 v[vgprValuC+72], acc46 // copy acc to vreg[139] +v_accvgpr_read_b32 v[vgprValuC+73], acc50 // copy acc to vreg[140] +v_accvgpr_read_b32 v[vgprValuC+74], acc54 // copy acc to vreg[141] +v_accvgpr_read_b32 v[vgprValuC+75], acc58 // copy acc to vreg[142] +v_accvgpr_read_b32 v[vgprValuC+76], acc62 // copy acc to vreg[143] +v_accvgpr_read_b32 v[vgprValuC+77], acc66 // copy acc to vreg[144] +v_accvgpr_read_b32 v[vgprValuC+78], acc70 // copy acc to vreg[145] +v_accvgpr_read_b32 v[vgprValuC+79], acc74 // copy acc to vreg[146] +v_accvgpr_read_b32 v[vgprValuC+80], acc78 // copy acc to vreg[147] +v_accvgpr_read_b32 v[vgprValuC+81], acc82 // copy acc to vreg[148] +v_accvgpr_read_b32 v[vgprValuC+82], acc86 // copy acc to vreg[149] +v_accvgpr_read_b32 v[vgprValuC+83], acc90 // copy acc to vreg[150] +v_accvgpr_read_b32 v[vgprValuC+84], acc94 // copy acc to vreg[151] +v_accvgpr_read_b32 v[vgprValuC+85], acc98 // copy acc to vreg[152] +v_accvgpr_read_b32 v[vgprValuC+86], acc102 // copy acc to vreg[153] +v_accvgpr_read_b32 v[vgprValuC+87], acc106 // copy acc to vreg[154] +v_accvgpr_read_b32 v[vgprValuC+88], acc110 // copy acc to vreg[155] + +/* rC *= alpha batchElements=[(0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 
7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) 
+v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+88], s[sgprAlpha], v[vgprValuC+88] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* 
apply mask, calc new C and issue writes */ +v_fma_mix_f32 v[vgprValuC+11], s[sgprBeta], v89, v[vgprValuC+11] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert C to fp16 +buffer_store_short v11, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+12], s[sgprBeta], v91, v[vgprValuC+12] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+13], s[sgprBeta], v93, v[vgprValuC+13] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+14], s[sgprBeta], v95, v[vgprValuC+14] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+15], s[sgprBeta], v97, v[vgprValuC+15] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v15, v[vgprValuC+15] // convert C to fp16 +buffer_store_short v15, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v99, v[vgprValuC+16] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v101, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v103, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 
+buffer_store_short v18, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v105, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v107, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v109, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v111, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v113, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 +buffer_store_short v23, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v115, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v117, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 +buffer_store_short v25, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v119, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta 
+v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v121, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v123, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v125, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v127, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v129, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v131, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v136, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 +buffer_store_short v33, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v138, 
v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 +buffer_store_short v34, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v140, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 +buffer_store_short v35, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v142, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 +buffer_store_short v36, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v144, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 +buffer_store_short v37, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v146, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 +buffer_store_short v38, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v148, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 +buffer_store_short v39, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v150, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 +buffer_store_short v40, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v152, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 +buffer_store_short v41, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v154, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 +buffer_store_short v42, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v156, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 +buffer_store_short v43, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v158, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 +buffer_store_short v44, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v160, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 +buffer_store_short v45, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v162, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 +buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 +buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v166, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 +buffer_store_short v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v168, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 +buffer_store_short v49, v169, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v170, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 +buffer_store_short v50, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v172, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 +buffer_store_short v51, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v174, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 +buffer_store_short v52, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v176, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 +buffer_store_short v53, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v178, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 +buffer_store_short v54, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v180, v[vgprValuC+55] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 +buffer_store_short v55, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v182, v[vgprValuC+56] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 +buffer_store_short v56, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v184, v[vgprValuC+57] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v57, v[vgprValuC+57] // 
convert C to fp16 +buffer_store_short v57, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v186, v[vgprValuC+58] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 +buffer_store_short v58, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v188, v[vgprValuC+59] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 +buffer_store_short v59, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v190, v[vgprValuC+60] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 +buffer_store_short v60, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v192, v[vgprValuC+61] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v61, v[vgprValuC+61] // convert C to fp16 +buffer_store_short v61, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v194, v[vgprValuC+62] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v62, v[vgprValuC+62] // convert C to fp16 +buffer_store_short v62, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v196, v[vgprValuC+63] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v63, v[vgprValuC+63] // convert C to fp16 +buffer_store_short v63, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v198, v[vgprValuC+64] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v64, v[vgprValuC+64] // convert C to fp16 +buffer_store_short v64, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v200, v[vgprValuC+65] op_sel:[0,0,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_cvt_f16_f32 v65, v[vgprValuC+65] // convert C to fp16 +buffer_store_short v65, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v202, v[vgprValuC+66] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v66, v[vgprValuC+66] // convert C to fp16 +buffer_store_short v66, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v204, v[vgprValuC+67] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v67, v[vgprValuC+67] // convert C to fp16 +buffer_store_short v67, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v206, v[vgprValuC+68] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v68, v[vgprValuC+68] // convert C to fp16 +buffer_store_short v68, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v208, v[vgprValuC+69] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v69, v[vgprValuC+69] // convert C to fp16 +buffer_store_short v69, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v210, v[vgprValuC+70] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v70, v[vgprValuC+70] // convert C to fp16 +buffer_store_short v70, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v212, v[vgprValuC+71] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v71, v[vgprValuC+71] // convert C to fp16 +buffer_store_short v71, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v214, v[vgprValuC+72] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v72, v[vgprValuC+72] // convert C to fp16 +buffer_store_short v72, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v216, 
v[vgprValuC+73] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v73, v[vgprValuC+73] // convert C to fp16 +buffer_store_short v73, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v218, v[vgprValuC+74] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v74, v[vgprValuC+74] // convert C to fp16 +buffer_store_short v74, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v220, v[vgprValuC+75] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v75, v[vgprValuC+75] // convert C to fp16 +buffer_store_short v75, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v222, v[vgprValuC+76] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v76, v[vgprValuC+76] // convert C to fp16 +buffer_store_short v76, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v224, v[vgprValuC+77] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v77, v[vgprValuC+77] // convert C to fp16 +buffer_store_short v77, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+78], s[sgprBeta], v226, v[vgprValuC+78] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v78, v[vgprValuC+78] // convert C to fp16 +buffer_store_short v78, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v228, v[vgprValuC+79] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v79, v[vgprValuC+79] // convert C to fp16 +buffer_store_short v79, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v230, v[vgprValuC+80] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v80, v[vgprValuC+80] // convert C to fp16 +buffer_store_short v80, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v232, v[vgprValuC+81] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v81, v[vgprValuC+81] // convert C to fp16 +buffer_store_short v81, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v234, v[vgprValuC+82] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v82, v[vgprValuC+82] // convert C to fp16 +buffer_store_short v82, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v236, v[vgprValuC+83] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v83, v[vgprValuC+83] // convert C to fp16 +buffer_store_short v83, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v238, v[vgprValuC+84] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v84, v[vgprValuC+84] // convert C to fp16 +buffer_store_short v84, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v240, v[vgprValuC+85] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v85, v[vgprValuC+85] // convert C to fp16 +buffer_store_short v85, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v242, v[vgprValuC+86] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v86, v[vgprValuC+86] // convert C to fp16 +buffer_store_short v86, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+87], s[sgprBeta], v244, v[vgprValuC+87] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v87, v[vgprValuC+87] // convert C to fp16 +buffer_store_short v87, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v246, v[vgprValuC+88] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v88, v[vgprValuC+88] // convert C to fp16 +buffer_store_short v88, v247, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */ +/* (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1); (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,19,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], 
s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v89, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v90, v10, v90, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v92, v10, v92, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,19,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v94, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v93, v94, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v94, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v94, v10, v94, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,19,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v96, v10, v96, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v98, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v98, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v98, v10, v98, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v100, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v99, v100, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v100, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v100, v10, v100, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v102, v10, v102, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v104, v10, v104, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,20,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v106, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v105, v106, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v106, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v106, v10, v106, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v108, v10, v108, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v110, v10, v110, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,20,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v112, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v111, v112, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v112, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v112, v10, v112, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v114, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v114, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v114, v10, v114, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v116, v10, v116, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v118, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v117, v118, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v118, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v118, v10, v118, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v120, v10, v120, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v122, v10, v122, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v124, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v123, v124, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v124, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v124, v10, v124, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,21,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v126, v10, v126, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,21,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v128, v10, v128, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v130, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v129, v130, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v130, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v130, v10, v130, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v135, v10, v135, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v137, v10, v137, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v139, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v138, v139, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v139, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v139, v10, v139, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v141, v10, v141, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v143, v10, v143, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,22,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v145, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v144, v145, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v145, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v145, v10, v145, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,22,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v147, v10, v147, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v149, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v149, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v149, v10, v149, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v151, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v150, v151, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v151, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v151, v10, v151, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v153, v10, v153, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v155, v10, v155, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v157, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v156, v157, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v157, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v157, v10, v157, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v159, v10, v159, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,23,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v161, v10, v161, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,23,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v163, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v162, v163, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v163, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v163, v10, v163, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v165, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v165, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v165, v10, v165, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v167, v10, v167, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v169, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v168, v169, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v169, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v169, v10, v169, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v171, v10, v171, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v173, v10, v173, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v175, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v174, v175, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v175, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v175, v10, v175, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,24,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v177, v10, v177, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,24,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v179, v10, v179, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v181, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v180, v181, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v181, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v181, v10, v181, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v183, v10, v183, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v185, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v184, v185, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v185, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v185, v10, v185, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v187, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v186, v187, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v187, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v187, v10, v187, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v189, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v188, v189, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v189, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v189, v10, v189, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v191, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v190, v191, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v191, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v191, v10, v191, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,25,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v193, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v192, v193, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v193, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v193, v10, v193, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,25,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v195, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v194, v195, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v195, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v195, v10, v195, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v197, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v196, v197, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v197, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v197, v10, v197, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v199, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v198, v199, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v199, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v199, v10, v199, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v201, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v200, v201, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v201, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v201, v10, v201, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v203, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v202, v203, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v203, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v203, v10, v203, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v205, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v204, v205, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v205, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v205, v10, v205, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v207, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v206, v207, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v207, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v207, v10, v207, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,26,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v209, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v208, v209, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v209, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v209, v10, v209, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,26,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v211, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v210, v211, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v211, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v211, v10, v211, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v213, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v212, v213, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v213, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v213, v10, v213, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v215, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v214, v215, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v215, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v215, v10, v215, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v217, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v216, v217, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v217, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v217, v10, v217, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v219, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16_hi v218, v219, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v219, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v219, v10, v219, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v221, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v220, v221, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v221, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v221, v10, v221, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v223, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v222, v223, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v223, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v223, v10, v223, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,27,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v225, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v224, v225, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v225, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v225, v10, v225, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,27,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v227, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v226, v227, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v227, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v227, v10, v227, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v229, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v228, v229, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v229, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v229, v10, v229, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v231, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v230, v231, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v231, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v231, v10, v231, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v233, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v232, v233, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v233, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v233, v10, v233, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v235, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v234, v235, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v235, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v235, v10, v235, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v237, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDC clip if OOB. 
offset +buffer_load_short_d16 v236, v237, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v237, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v237, v10, v237, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v239, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v238, v239, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v239, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v239, v10, v239, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,28,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v241, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v240, v241, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v241, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v241, v10, v241, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,28,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v243, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v242, v243, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v243, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v243, v10, v243, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v245, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v244, v245, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v245, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v245, v10, v245, s[34:35] // LDD clip if OOB. 
offset +/* (d1,vc1,d0,vc0)=(0,29,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v247, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v247, v10, v247, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v246, v247, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v247, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v247, v10, v247, s[34:35] // LDD clip if OOB. offset +v_accvgpr_read_b32 v[vgprValuC+11], acc114 // copy acc to vreg[156] +v_accvgpr_read_b32 v[vgprValuC+12], acc118 // copy acc to vreg[157] +v_accvgpr_read_b32 v[vgprValuC+13], acc122 // copy acc to vreg[158] +v_accvgpr_read_b32 v[vgprValuC+14], acc126 // copy acc to vreg[159] +v_accvgpr_read_b32 v[vgprValuC+15], acc130 // copy acc to vreg[160] +v_accvgpr_read_b32 v[vgprValuC+16], acc134 // copy acc to vreg[161] +v_accvgpr_read_b32 v[vgprValuC+17], acc138 // copy acc to vreg[162] +v_accvgpr_read_b32 v[vgprValuC+18], acc142 // copy acc to vreg[163] +v_accvgpr_read_b32 v[vgprValuC+19], acc146 // copy acc to vreg[164] +v_accvgpr_read_b32 v[vgprValuC+20], acc150 // copy acc to vreg[165] +v_accvgpr_read_b32 v[vgprValuC+21], acc154 // copy acc to vreg[166] +v_accvgpr_read_b32 v[vgprValuC+22], acc158 // copy acc to vreg[167] +v_accvgpr_read_b32 v[vgprValuC+23], acc162 // copy acc to vreg[168] +v_accvgpr_read_b32 v[vgprValuC+24], acc166 // copy acc to vreg[169] +v_accvgpr_read_b32 v[vgprValuC+25], acc170 // copy acc to vreg[170] +v_accvgpr_read_b32 v[vgprValuC+26], acc174 // copy acc to vreg[171] +v_accvgpr_read_b32 v[vgprValuC+27], acc178 // copy acc to vreg[172] +v_accvgpr_read_b32 v[vgprValuC+28], acc182 // copy acc to vreg[173] +v_accvgpr_read_b32 v[vgprValuC+29], acc186 // copy acc to 
vreg[174] +v_accvgpr_read_b32 v[vgprValuC+30], acc190 // copy acc to vreg[175] +v_accvgpr_read_b32 v[vgprValuC+31], acc194 // copy acc to vreg[176] +v_accvgpr_read_b32 v[vgprValuC+32], acc198 // copy acc to vreg[177] +v_accvgpr_read_b32 v[vgprValuC+33], acc202 // copy acc to vreg[178] +v_accvgpr_read_b32 v[vgprValuC+34], acc206 // copy acc to vreg[179] +v_accvgpr_read_b32 v[vgprValuC+35], acc210 // copy acc to vreg[180] +v_accvgpr_read_b32 v[vgprValuC+36], acc214 // copy acc to vreg[181] +v_accvgpr_read_b32 v[vgprValuC+37], acc218 // copy acc to vreg[182] +v_accvgpr_read_b32 v[vgprValuC+38], acc222 // copy acc to vreg[183] +v_accvgpr_read_b32 v[vgprValuC+39], acc226 // copy acc to vreg[184] +v_accvgpr_read_b32 v[vgprValuC+40], acc230 // copy acc to vreg[185] +v_accvgpr_read_b32 v[vgprValuC+41], acc234 // copy acc to vreg[186] +v_accvgpr_read_b32 v[vgprValuC+42], acc238 // copy acc to vreg[187] +v_accvgpr_read_b32 v[vgprValuC+43], acc242 // copy acc to vreg[188] +v_accvgpr_read_b32 v[vgprValuC+44], acc246 // copy acc to vreg[189] +v_accvgpr_read_b32 v[vgprValuC+45], acc250 // copy acc to vreg[190] +v_accvgpr_read_b32 v[vgprValuC+46], acc254 // copy acc to vreg[191] +v_accvgpr_read_b32 v[vgprValuC+47], acc3 // copy acc to vreg[192] +v_accvgpr_read_b32 v[vgprValuC+48], acc7 // copy acc to vreg[193] +v_accvgpr_read_b32 v[vgprValuC+49], acc11 // copy acc to vreg[194] +v_accvgpr_read_b32 v[vgprValuC+50], acc15 // copy acc to vreg[195] +v_accvgpr_read_b32 v[vgprValuC+51], acc19 // copy acc to vreg[196] +v_accvgpr_read_b32 v[vgprValuC+52], acc23 // copy acc to vreg[197] +v_accvgpr_read_b32 v[vgprValuC+53], acc27 // copy acc to vreg[198] +v_accvgpr_read_b32 v[vgprValuC+54], acc31 // copy acc to vreg[199] +v_accvgpr_read_b32 v[vgprValuC+55], acc35 // copy acc to vreg[200] +v_accvgpr_read_b32 v[vgprValuC+56], acc39 // copy acc to vreg[201] +v_accvgpr_read_b32 v[vgprValuC+57], acc43 // copy acc to vreg[202] +v_accvgpr_read_b32 v[vgprValuC+58], acc47 // copy acc to vreg[203] 
+v_accvgpr_read_b32 v[vgprValuC+59], acc51 // copy acc to vreg[204] +v_accvgpr_read_b32 v[vgprValuC+60], acc55 // copy acc to vreg[205] +v_accvgpr_read_b32 v[vgprValuC+61], acc59 // copy acc to vreg[206] +v_accvgpr_read_b32 v[vgprValuC+62], acc63 // copy acc to vreg[207] +v_accvgpr_read_b32 v[vgprValuC+63], acc67 // copy acc to vreg[208] +v_accvgpr_read_b32 v[vgprValuC+64], acc71 // copy acc to vreg[209] +v_accvgpr_read_b32 v[vgprValuC+65], acc75 // copy acc to vreg[210] +v_accvgpr_read_b32 v[vgprValuC+66], acc79 // copy acc to vreg[211] +v_accvgpr_read_b32 v[vgprValuC+67], acc83 // copy acc to vreg[212] +v_accvgpr_read_b32 v[vgprValuC+68], acc87 // copy acc to vreg[213] +v_accvgpr_read_b32 v[vgprValuC+69], acc91 // copy acc to vreg[214] +v_accvgpr_read_b32 v[vgprValuC+70], acc95 // copy acc to vreg[215] +v_accvgpr_read_b32 v[vgprValuC+71], acc99 // copy acc to vreg[216] +v_accvgpr_read_b32 v[vgprValuC+72], acc103 // copy acc to vreg[217] +v_accvgpr_read_b32 v[vgprValuC+73], acc107 // copy acc to vreg[218] +v_accvgpr_read_b32 v[vgprValuC+74], acc111 // copy acc to vreg[219] +v_accvgpr_read_b32 v[vgprValuC+75], acc115 // copy acc to vreg[220] +v_accvgpr_read_b32 v[vgprValuC+76], acc119 // copy acc to vreg[221] +v_accvgpr_read_b32 v[vgprValuC+77], acc123 // copy acc to vreg[222] +v_accvgpr_read_b32 v[vgprValuC+78], acc127 // copy acc to vreg[223] +v_accvgpr_read_b32 v[vgprValuC+79], acc131 // copy acc to vreg[224] +v_accvgpr_read_b32 v[vgprValuC+80], acc135 // copy acc to vreg[225] +v_accvgpr_read_b32 v[vgprValuC+81], acc139 // copy acc to vreg[226] +v_accvgpr_read_b32 v[vgprValuC+82], acc143 // copy acc to vreg[227] +v_accvgpr_read_b32 v[vgprValuC+83], acc147 // copy acc to vreg[228] +v_accvgpr_read_b32 v[vgprValuC+84], acc151 // copy acc to vreg[229] +v_accvgpr_read_b32 v[vgprValuC+85], acc155 // copy acc to vreg[230] +v_accvgpr_read_b32 v[vgprValuC+86], acc159 // copy acc to vreg[231] +v_accvgpr_read_b32 v[vgprValuC+87], acc163 // copy acc to vreg[232] 
+v_accvgpr_read_b32 v[vgprValuC+88], acc167 // copy acc to vreg[233] + +/* rC *= alpha batchElements=[(0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3), (0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] 
op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 
v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+88], s[sgprAlpha], v[vgprValuC+88] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +v_fma_mix_f32 v[vgprValuC+11], s[sgprBeta], v89, v[vgprValuC+11] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert C to fp16 +buffer_store_short v11, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+12], s[sgprBeta], v91, v[vgprValuC+12] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+13], s[sgprBeta], v93, v[vgprValuC+13] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+14], s[sgprBeta], v95, v[vgprValuC+14] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+15], s[sgprBeta], v97, v[vgprValuC+15] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v15, v[vgprValuC+15] // convert C to fp16 +buffer_store_short v15, v98, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v99, v[vgprValuC+16] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v101, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v103, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 +buffer_store_short v18, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v105, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v107, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v109, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v111, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v113, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v23, v[vgprValuC+23] // 
convert C to fp16 +buffer_store_short v23, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v115, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v117, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 +buffer_store_short v25, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v119, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v121, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v123, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v125, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v127, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v129, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v131, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v136, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 +buffer_store_short v33, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v138, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 +buffer_store_short v34, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v140, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 +buffer_store_short v35, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v142, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 +buffer_store_short v36, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v144, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 +buffer_store_short v37, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v146, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 +buffer_store_short v38, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v148, 
v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 +buffer_store_short v39, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v150, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 +buffer_store_short v40, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v152, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 +buffer_store_short v41, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v154, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 +buffer_store_short v42, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v156, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 +buffer_store_short v43, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v158, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 +buffer_store_short v44, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v160, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 +buffer_store_short v45, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v162, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 +buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 +buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v166, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 +buffer_store_short v48, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v168, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 +buffer_store_short v49, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v170, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 +buffer_store_short v50, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v172, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 +buffer_store_short v51, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v174, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 +buffer_store_short v52, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v176, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 +buffer_store_short v53, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v178, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 +buffer_store_short v54, v179, 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+55], s[sgprBeta], v180, v[vgprValuC+55] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 +buffer_store_short v55, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+56], s[sgprBeta], v182, v[vgprValuC+56] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 +buffer_store_short v56, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+57], s[sgprBeta], v184, v[vgprValuC+57] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 +buffer_store_short v57, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+58], s[sgprBeta], v186, v[vgprValuC+58] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 +buffer_store_short v58, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+59], s[sgprBeta], v188, v[vgprValuC+59] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 +buffer_store_short v59, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+60], s[sgprBeta], v190, v[vgprValuC+60] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 +buffer_store_short v60, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+61], s[sgprBeta], v192, v[vgprValuC+61] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v61, v[vgprValuC+61] // convert C to fp16 +buffer_store_short v61, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+62], s[sgprBeta], v194, v[vgprValuC+62] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v62, v[vgprValuC+62] // 
convert C to fp16 +buffer_store_short v62, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+63], s[sgprBeta], v196, v[vgprValuC+63] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v63, v[vgprValuC+63] // convert C to fp16 +buffer_store_short v63, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+64], s[sgprBeta], v198, v[vgprValuC+64] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v64, v[vgprValuC+64] // convert C to fp16 +buffer_store_short v64, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+65], s[sgprBeta], v200, v[vgprValuC+65] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v65, v[vgprValuC+65] // convert C to fp16 +buffer_store_short v65, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+66], s[sgprBeta], v202, v[vgprValuC+66] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v66, v[vgprValuC+66] // convert C to fp16 +buffer_store_short v66, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+67], s[sgprBeta], v204, v[vgprValuC+67] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v67, v[vgprValuC+67] // convert C to fp16 +buffer_store_short v67, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+68], s[sgprBeta], v206, v[vgprValuC+68] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v68, v[vgprValuC+68] // convert C to fp16 +buffer_store_short v68, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+69], s[sgprBeta], v208, v[vgprValuC+69] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v69, v[vgprValuC+69] // convert C to fp16 +buffer_store_short v69, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+70], s[sgprBeta], v210, v[vgprValuC+70] op_sel:[0,1,0] op_sel_hi:[0,1,0] // 
//C*=beta +v_cvt_f16_f32 v70, v[vgprValuC+70] // convert C to fp16 +buffer_store_short v70, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+71], s[sgprBeta], v212, v[vgprValuC+71] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v71, v[vgprValuC+71] // convert C to fp16 +buffer_store_short v71, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+72], s[sgprBeta], v214, v[vgprValuC+72] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v72, v[vgprValuC+72] // convert C to fp16 +buffer_store_short v72, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+73], s[sgprBeta], v216, v[vgprValuC+73] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v73, v[vgprValuC+73] // convert C to fp16 +buffer_store_short v73, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+74], s[sgprBeta], v218, v[vgprValuC+74] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v74, v[vgprValuC+74] // convert C to fp16 +buffer_store_short v74, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+75], s[sgprBeta], v220, v[vgprValuC+75] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v75, v[vgprValuC+75] // convert C to fp16 +buffer_store_short v75, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+76], s[sgprBeta], v222, v[vgprValuC+76] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v76, v[vgprValuC+76] // convert C to fp16 +buffer_store_short v76, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+77], s[sgprBeta], v224, v[vgprValuC+77] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v77, v[vgprValuC+77] // convert C to fp16 +buffer_store_short v77, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+78], s[sgprBeta], v226, 
v[vgprValuC+78] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v78, v[vgprValuC+78] // convert C to fp16 +buffer_store_short v78, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+79], s[sgprBeta], v228, v[vgprValuC+79] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v79, v[vgprValuC+79] // convert C to fp16 +buffer_store_short v79, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+80], s[sgprBeta], v230, v[vgprValuC+80] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v80, v[vgprValuC+80] // convert C to fp16 +buffer_store_short v80, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+81], s[sgprBeta], v232, v[vgprValuC+81] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v81, v[vgprValuC+81] // convert C to fp16 +buffer_store_short v81, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+82], s[sgprBeta], v234, v[vgprValuC+82] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v82, v[vgprValuC+82] // convert C to fp16 +buffer_store_short v82, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+83], s[sgprBeta], v236, v[vgprValuC+83] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v83, v[vgprValuC+83] // convert C to fp16 +buffer_store_short v83, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+84], s[sgprBeta], v238, v[vgprValuC+84] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v84, v[vgprValuC+84] // convert C to fp16 +buffer_store_short v84, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+85], s[sgprBeta], v240, v[vgprValuC+85] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v85, v[vgprValuC+85] // convert C to fp16 +buffer_store_short v85, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D 
+/* Tail of the write batch that precedes Batch #3: per element, v_fma_mix_f32 adds s[sgprBeta] * (loaded C value) to ValuC, the result is converted to fp16 and stored as 16 bits to D. */ +v_fma_mix_f32 v[vgprValuC+86], s[sgprBeta], v242, v[vgprValuC+86] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v86, v[vgprValuC+86] // convert C to fp16 +buffer_store_short v86, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+87], s[sgprBeta], v244, v[vgprValuC+87] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v87, v[vgprValuC+87] // convert C to fp16 +buffer_store_short v87, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+88], s[sgprBeta], v246, v[vgprValuC+88] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v88, v[vgprValuC+88] // convert C to fp16 +buffer_store_short v88, v247, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */ + +/******************************************/ +/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */ +/* (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */ +/******************************************/ + +/* calc coords, apply mask, and issue loads (if necessary) */ +/* Per element: s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ); the C offset is (v6 + coord0) << 1, and v_cndmask substitutes v10 (BufferOOB) where the mask is clear. After the C load is issued, the same address VGPR is recomputed from v7 and masked again so it can be used by the later D store. */ +v_mov_b32 v10, BufferOOB +/* (d1,vc1,d0,vc0)=(0,29,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v34, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v34, v10, v34, s[34:35] // LDC clip if
OOB. offset +buffer_load_short_d16 v33, v34, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v34, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v34, v10, v34, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v36, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v36, v10, v36, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v35, v36, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v36, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v36, v10, v36, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v38, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v38, v10, v38, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v37, v38, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v38, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v38, v10, v38, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,29,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v40, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v40, v10, v40, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v39, v40, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v40, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v40, v10, v40, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v42, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v42, v10, v42, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v41, v42, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v42, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v42, v10, v42, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,29,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v44, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDC clip if OOB.
offset +buffer_load_short_d16_hi v43, v44, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v44, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v44, v10, v44, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v46, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v45, v46, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v46, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v46, v10, v46, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v48, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v47, v48, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v48, v10, v48, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,30,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v50, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v49, v50, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v50, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v50, v10, v50, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v52, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v51, v52, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v52, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v52, v10, v52, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v54, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDC clip if OOB.
offset +buffer_load_short_d16 v53, v54, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v54, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v54, v10, v54, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v56, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v55, v56, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v56, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v56, v10, v56, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,30,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v57, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v58, v10, v58, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,30,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v60, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v59, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v60, v10, v60, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,0) */ +v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 + +/* Fix for UseInitialStridesCD, emitAddressSetupCode */ +v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row +v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row +v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v62, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v61, v62, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v62, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v62, v10, v62, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,1) */ +v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v64, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDC clip if OOB.
offset +buffer_load_short_d16_hi v63, v64, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v64, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v64, v10, v64, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,2) */ +v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v66, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v65, v66, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v66, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v66, v10, v66, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,3) */ +v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v68, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v67, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v68, v10, v68, s[34:35] // LDD clip if OOB.
offset +/* (d1,vc1,d0,vc0)=(0,31,0,4) */ +v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16 v69, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v70, v10, v70, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,5) */ +v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v72, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v72, v10, v72, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v71, v72, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v72, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v72, v10, v72, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,6) */ +v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v74, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v74, v10, v74, s[34:35] // LDC clip if OOB.
offset +buffer_load_short_d16 v73, v74, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v74, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v74, v10, v74, s[34:35] // LDD clip if OOB. offset +/* (d1,vc1,d0,vc0)=(0,31,0,7) */ +v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 +v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[34:35], s[30:31], s[34:35] // in0 && in1 +v_add_lshl_u32 v76, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v76, v10, v76, s[34:35] // LDC clip if OOB. offset +buffer_load_short_d16_hi v75, v76, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C +v_add_lshl_u32 v76, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr +v_cndmask_b32 v76, v10, v76, s[34:35] // LDD clip if OOB. offset +/* Copy the 22 accumulator values for this batch (acc171 through acc255, every fourth register) into ValuC+11..ValuC+32. */ +v_accvgpr_read_b32 v[vgprValuC+11], acc171 // copy acc to vreg[234] +v_accvgpr_read_b32 v[vgprValuC+12], acc175 // copy acc to vreg[235] +v_accvgpr_read_b32 v[vgprValuC+13], acc179 // copy acc to vreg[236] +v_accvgpr_read_b32 v[vgprValuC+14], acc183 // copy acc to vreg[237] +v_accvgpr_read_b32 v[vgprValuC+15], acc187 // copy acc to vreg[238] +v_accvgpr_read_b32 v[vgprValuC+16], acc191 // copy acc to vreg[239] +v_accvgpr_read_b32 v[vgprValuC+17], acc195 // copy acc to vreg[240] +v_accvgpr_read_b32 v[vgprValuC+18], acc199 // copy acc to vreg[241] +v_accvgpr_read_b32 v[vgprValuC+19], acc203 // copy acc to vreg[242] +v_accvgpr_read_b32 v[vgprValuC+20], acc207 // copy acc to vreg[243] +v_accvgpr_read_b32 v[vgprValuC+21], acc211 // copy acc to vreg[244] +v_accvgpr_read_b32 v[vgprValuC+22], acc215 // copy acc to vreg[245] +v_accvgpr_read_b32 v[vgprValuC+23], acc219 // copy acc to vreg[246] +v_accvgpr_read_b32 v[vgprValuC+24], acc223 // copy acc to vreg[247] +v_accvgpr_read_b32 v[vgprValuC+25], acc227 // copy acc to vreg[248] +v_accvgpr_read_b32
v[vgprValuC+26], acc231 // copy acc to vreg[249] +v_accvgpr_read_b32 v[vgprValuC+27], acc235 // copy acc to vreg[250] +v_accvgpr_read_b32 v[vgprValuC+28], acc239 // copy acc to vreg[251] +v_accvgpr_read_b32 v[vgprValuC+29], acc243 // copy acc to vreg[252] +v_accvgpr_read_b32 v[vgprValuC+30], acc247 // copy acc to vreg[253] +v_accvgpr_read_b32 v[vgprValuC+31], acc251 // copy acc to vreg[254] +v_accvgpr_read_b32 v[vgprValuC+32], acc255 // copy acc to vreg[255] + +/* rC *= alpha batchElements=[(0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */ +v_mul_f32 v[vgprValuC+11], s[sgprAlpha], v[vgprValuC+11] // *= alpha +v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha
(pk) +v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk) +v_mul_f32 v[vgprValuC+32], s[sgprAlpha], v[vgprValuC+32] // *= alpha +s_waitcnt vmcnt(0) // wait for Beta + +/* apply mask, calc new C and issue writes */ +/* op_sel[1] alternates 0/1 in step with the buffer_load_short_d16 / buffer_load_short_d16_hi that filled each C register; presumably it picks the low/high 16-bit half of that register (confirm in the ISA guide). */ +v_fma_mix_f32 v[vgprValuC+11], s[sgprBeta], v33, v[vgprValuC+11] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v11, v[vgprValuC+11] // convert C to fp16 +buffer_store_short v11, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+12], s[sgprBeta], v35, v[vgprValuC+12] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v12, v[vgprValuC+12] // convert C to fp16 +buffer_store_short v12, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+13], s[sgprBeta], v37, v[vgprValuC+13] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v13, v[vgprValuC+13] // convert C to fp16 +buffer_store_short v13, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+14], s[sgprBeta], v39, v[vgprValuC+14] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v14, v[vgprValuC+14] // convert C to fp16 +buffer_store_short v14, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+15], s[sgprBeta], v41, v[vgprValuC+15] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v15, v[vgprValuC+15] // convert C to fp16 +buffer_store_short v15, v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+16], s[sgprBeta], v43, v[vgprValuC+16] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v16, v[vgprValuC+16] // convert C to fp16 +buffer_store_short v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+17],
s[sgprBeta], v45, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 +buffer_store_short v17, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v47, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 +buffer_store_short v18, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v49, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 +buffer_store_short v19, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v51, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 +buffer_store_short v20, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v53, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 +buffer_store_short v21, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v55, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 +buffer_store_short v22, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v57, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 +buffer_store_short v23, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v59, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 +buffer_store_short v24, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
+/* Remaining Batch #3 elements (ValuC+25..ValuC+32): per element, v_fma_mix_f32 adds s[sgprBeta] * (loaded C value) to ValuC, the result is converted to fp16 and stored as 16 bits to D. */ +v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v61, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 +buffer_store_short v25, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v63, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 +buffer_store_short v26, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v65, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 +buffer_store_short v27, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v67, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 +buffer_store_short v28, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v69, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 +buffer_store_short v29, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v71, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 +buffer_store_short v30, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v73, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 +buffer_store_short v31, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v75, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta +v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 +buffer_store_short v32, v76,
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D +s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst +/* label_GW_End_2 is defined on the very next line, so this branch and plain fall-through reach the same place. */ +s_branch label_GW_End_2 // jump to end +label_GW_End_2: +label_KernelEnd: +s_endpgm // Kernel End +label_ASM_End: /// The end of the kernel