diff --git a/clients/gtest/matmul_gtest.yaml b/clients/gtest/matmul_gtest.yaml index 43fb5637ff..635261f9ac 100755 --- a/clients/gtest/matmul_gtest.yaml +++ b/clients/gtest/matmul_gtest.yaml @@ -697,6 +697,48 @@ Tests: unit_check: 1 gpu_arch: '94[0-2]' +#TODO: extend to all f8 transpose and datatype if necessary +- name: matmul_real_f8_dst_f8_TT + category: pre_checkin + function: + matmul: *f8_precision_dst_f8 + matrix_size: + - { M: 1, N: 1, K: 1 } + - { M: 1, N: 1, K: 127 } + - { M: 1, N: 127, K: 127 } + - { M: 2, N: 127, K: 127 } + - { M: 3, N: 127, K: 127 } + - { M: 127, N: 1, K: 127 } + - { M: 127, N: 2, K: 127 } + - { M: 127, N: 3, K: 127 } + - { M: 1, N: 1, K: 128 } + - { M: 1, N: 128, K: 128 } + - { M: 2, N: 128, K: 128 } + - { M: 3, N: 128, K: 128 } + - { M: 128, N: 1, K: 128 } + - { M: 128, N: 2, K: 128 } + - { M: 128, N: 3, K: 128 } + - { M: 1, N: 1, K: 129 } + - { M: 1, N: 129, K: 129 } + - { M: 2, N: 129, K: 129 } + - { M: 3, N: 129, K: 129 } + - { M: 129, N: 1, K: 129 } + - { M: 129, N: 2, K: 129 } + - { M: 129, N: 3, K: 129 } + transA: T + transB: T + alpha: 1 + beta: [ 0.0, 2.0 ] + scaleA: [0, 1] + scaleB: [0, 1] + scaleC: [0] + scaleD: [0] + bias_vector: [0, 1] + bias_type: f32_r + unit_check: 1 + gpu_arch: '942' + + - name: matmul_real_1b_dst_f8_SCDNotInt category: pre_checkin function: diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs.yaml index a0c24592ae..a9f2b994c1 100644 --- a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs.yaml +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs.yaml @@ -70,7 +70,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: true - - 1LDSBuffer: 0 @@ -275,7 +275,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: true ScheduleGlobalRead: 1 @@ -534,7 +534,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: true ScheduleGlobalRead: 1 @@ -793,7 +793,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: true ScheduleGlobalRead: 1 @@ -850,15 +850,283 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 2 + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 0 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_MIWT1_1 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumBytes: 3392 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 768 + LdsOffsetMetadata_Blk: 2816 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 11 + DataTypeA: 11 + DataTypeB: 11 + DataTypeE: 11 + DestDataType: 11 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_GSU1_MIWT1_1_SU0_SUM0_SUS0_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 0 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 - [2, 3, 0, 1] -- - - [1, 128, 1, 128, 1, 1, 128, 128] +- - - [1, 128, 1, 128] - [2, 2.80861] - - - [128, 128, 1, 128, 128, 128, 128, 128] + - - [128, 128, 1, 128] - [0, 237.234] - - - [127, 127, 1, 128, 127, 127, 128, 127] + - - [127, 127, 1, 128] - [1, 220.532] - - - [129, 129, 1, 128, 129, 129, 129, 129] + - - [129, 129, 1, 128] - [2, 217.741] + - - [3, 3, 1, 3] + - [3, 0.0] - null - null - DeviceEfficiency diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_AS_SAB_SAV_UserArgs.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_AS_SAB_SAV_UserArgs.yaml index 86921d644b..3ec6fcac1f 100644 --- a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_AS_SAB_SAV_UserArgs.yaml +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_AS_SAB_SAV_UserArgs.yaml @@ -70,7 +70,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: false - - 1LDSBuffer: 0 @@ -275,7 +275,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: false ScheduleGlobalRead: 1 @@ -534,7 +534,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: false ScheduleGlobalRead: 1 @@ -793,7 +793,7 @@ UseE: false UseInitialStridesAB: false UseInitialStridesCD: false - UseScaleAB: "Scalar" + UseScaleAB: Scalar UseScaleAlphaVec: 1 UseScaleCD: false ScheduleGlobalRead: 1 @@ -850,15 +850,283 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 2 + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 0 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_AS_SAB_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_MIWT1_1 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumBytes: 3392 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 768 + LdsOffsetMetadata_Blk: 2816 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: true + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 11 + DataTypeA: 11 + DataTypeB: 11 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_AS_SAB_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_GSU1_MIWT1_1_SU0_SUM0_SUS0_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 0 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 - [2, 3, 0, 1] -- - - [1, 128, 1, 128, 1, 1, 128, 128] +- - - [1, 128, 1, 128] - [2, 2.81827] - - - [128, 128, 1, 128, 128, 128, 128, 128] + - - [128, 128, 1, 128] - [0, 236.699] - - - [127, 127, 1, 128, 127, 127, 128, 127] + - - [127, 127, 1, 128] - [1, 238.396] - - - [129, 129, 1, 128, 129, 129, 129, 129] + - - [129, 129, 1, 128] - [2, 228.02] + - - [3, 3, 1, 3] + - [3, 0.0] - null - null - DeviceEfficiency